# install.packages("devtools")
# devtools::install_github("drsimonj/pipelearner")

1 Introduction

This is an example of how we could use pipelearner for a classification modelling pipeline. pipelearner has some limitations, but it provides a good syntax and an introduction to working with modelling pipelines inside dataframes. We can also write these modelling dataframes ourselves using rsample, which gives us more control over what is actually going on and over how many models we keep in memory. There is a good introduction to pipelearner on its github page.
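
As a taste of that approach, a modelling dataframe built by hand with rsample could look roughly like this (a sketch only, not run in this document; it uses the data d that we load in section 2):

# Sketch only, not run here: a hand-rolled modelling dataframe with rsample.
# The fitted models live in a list column, so we decide ourselves which fits
# stay in memory.
library(rsample)
library(purrr)
library(dplyr)

cv_df = vfold_cv(d, v = 10) %>%   # d as defined in section 2
  mutate( fit = map( splits
                     , ~ glm(cancer ~ ., family = 'binomial', data = analysis(.x)) ) )

cv_df  # one row per fold, fitted model in the 'fit' list column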

2 Sample Data

2.1 Table

data_url <- 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'

d <- read_csv(
  data_url,
  col_names = c('id', 'thickness', 'size_uniformity',
                'shape_uniformity', 'adhesion', 'epith_size',
                'nuclei', 'chromatin', 'nucleoli', 'mitoses', 'cancer')) %>% 
  select(-id) %>%            # Remove id; not useful here
  filter(nuclei != '?') %>%  # Remove records with missing data
  mutate_all(as.numeric) %>%
  mutate(cancer = cancer == 4,
         cancer = as.numeric(cancer),
         cancer = as.factor(cancer) )  # encode 'cancer' as a factor: 1 = malignant, 0 = benign
## Parsed with column specification:
## cols(
##   id = col_integer(),
##   thickness = col_integer(),
##   size_uniformity = col_integer(),
##   shape_uniformity = col_integer(),
##   adhesion = col_integer(),
##   epith_size = col_integer(),
##   nuclei = col_character(),
##   chromatin = col_integer(),
##   nucleoli = col_integer(),
##   mitoses = col_integer(),
##   cancer = col_integer()
## )
d = as_tibble(d)

2.2 Histograms

y_scale = scale_y_continuous( limits = c(0,nrow( filter(d, cancer==1)) ))

for(col in names(d)[-ncol(d)] ) {
  print(col)
  p = ggplot(d) +
    geom_freqpoly(aes_string(col, colour = 'cancer'), size = 1.5) +
    labs ( x = col) +
    y_scale
  
  print(p)

}
## [1] "thickness"
## [1] "size_uniformity"
## [1] "shape_uniformity"
## [1] "adhesion"
## [1] "epith_size"
## [1] "nuclei"
## [1] "chromatin"
## [1] "nucleoli"
## [1] "mitoses"

3 Pipelearner

Here we create the pipelearner object and add the cross validation pairs. We have to use set.seed() here in order to get reproducible results. As we will see further down, the performance of each model is very similar, and depending on how the data is split to create the cross validation pairs, a different model might come out on top. In practice one would have to run 10 repeats of 10-fold cross validation to confirm the results. pipelearner does not offer this functionality, but rsample and caret do.
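
For reference, repeated cross validation with rsample could look roughly like this (a sketch only, not used below):

# Sketch, not used below: 10 repeats of 10-fold cross validation with rsample
cv_rep = rsample::vfold_cv(d, v = 10, repeats = 10)
cv_rep  # 100 resamples; columns: splits, id (repeat), id2 (fold)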

set.seed(1)

pl= pipelearner(d) %>%
  learn_cvpairs( crossv_kfold, k = 10 ) 

pl
## $data
## # A tibble: 683 x 10
##    thickness size_uniformity shape_uniformity adhesion epith_size nuclei
##        <dbl>           <dbl>            <dbl>    <dbl>      <dbl>  <dbl>
##  1         5               1                1        1          2      1
##  2         5               4                4        5          7     10
##  3         3               1                1        1          2      2
##  4         6               8                8        1          3      4
##  5         4               1                1        3          2      1
##  6         8              10               10        8          7     10
##  7         1               1                1        1          2     10
##  8         2               1                2        1          2      1
##  9         2               1                1        1          2      1
## 10         4               2                1        1          2      1
## # ... with 673 more rows, and 4 more variables: chromatin <dbl>,
## #   nucleoli <dbl>, mitoses <dbl>, cancer <fctr>
## 
## $cv_pairs
## # A tibble: 10 x 3
##             train           test   .id
##            <list>         <list> <chr>
##  1 <S3: resample> <S3: resample>    01
##  2 <S3: resample> <S3: resample>    02
##  3 <S3: resample> <S3: resample>    03
##  4 <S3: resample> <S3: resample>    04
##  5 <S3: resample> <S3: resample>    05
##  6 <S3: resample> <S3: resample>    06
##  7 <S3: resample> <S3: resample>    07
##  8 <S3: resample> <S3: resample>    08
##  9 <S3: resample> <S3: resample>    09
## 10 <S3: resample> <S3: resample>    10
## 
## $train_ps
## [1] 1
## 
## $models
## NULL
## 
## attr(,"class")
## [1] "pipelearner"
no_vars = ncol(d)-1
mtry_vec = c(no_vars/3, no_vars/4, no_vars/5, sqrt(no_vars))

pl_m = pl %>%
  learn_models( glm, cancer~., family = 'binomial' ) %>%
  learn_models(randomForest::randomForest,cancer~.
               , ntree = 1000
               , mtry = mtry_vec)%>%
  learn_models(svm, cancer~.
               , cost = c(.001,0.1,1,10,100)
               , kernel = c('linear', 'radial')
               , probability = T ) %>%
  learn_models(naiveBayes, cancer~.)
## Warning: `cross_d()` is deprecated; please use `cross_df()` instead.

## Warning: `cross_d()` is deprecated; please use `cross_df()` instead.

## Warning: `cross_d()` is deprecated; please use `cross_df()` instead.

## Warning: `cross_d()` is deprecated; please use `cross_df()` instead.

## Warning: `cross_d()` is deprecated; please use `cross_df()` instead.

## Warning: `cross_d()` is deprecated; please use `cross_df()` instead.

## Warning: `cross_d()` is deprecated; please use `cross_df()` instead.

## Warning: `cross_d()` is deprecated; please use `cross_df()` instead.
  # learn_models(gbm, cancer~., distribution = 'bernoulli', n.trees = 5000, interaction.depth = c(4,6), shrinkage = c(0.001, 0.01, 0.1, 0.2, 1)) %>%
    

pl_m
## $data
## # A tibble: 683 x 10
##    thickness size_uniformity shape_uniformity adhesion epith_size nuclei
##        <dbl>           <dbl>            <dbl>    <dbl>      <dbl>  <dbl>
##  1         5               1                1        1          2      1
##  2         5               4                4        5          7     10
##  3         3               1                1        1          2      2
##  4         6               8                8        1          3      4
##  5         4               1                1        3          2      1
##  6         8              10               10        8          7     10
##  7         1               1                1        1          2     10
##  8         2               1                2        1          2      1
##  9         2               1                1        1          2      1
## 10         4               2                1        1          2      1
## # ... with 673 more rows, and 4 more variables: chromatin <dbl>,
## #   nucleoli <dbl>, mitoses <dbl>, cancer <fctr>
## 
## $cv_pairs
## # A tibble: 10 x 3
##             train           test   .id
##            <list>         <list> <chr>
##  1 <S3: resample> <S3: resample>    01
##  2 <S3: resample> <S3: resample>    02
##  3 <S3: resample> <S3: resample>    03
##  4 <S3: resample> <S3: resample>    04
##  5 <S3: resample> <S3: resample>    05
##  6 <S3: resample> <S3: resample>    06
##  7 <S3: resample> <S3: resample>    07
##  8 <S3: resample> <S3: resample>    08
##  9 <S3: resample> <S3: resample>    09
## 10 <S3: resample> <S3: resample>    10
## 
## $train_ps
## [1] 1
## 
## $models
## # A tibble: 16 x 5
##    target        model     params     .f   .id
##     <chr>        <chr>     <list> <list> <chr>
##  1 cancer          glm <list [2]>  <fun>     1
##  2 cancer randomForest <list [3]>  <fun>     2
##  3 cancer randomForest <list [3]>  <fun>     3
##  4 cancer randomForest <list [3]>  <fun>     4
##  5 cancer randomForest <list [3]>  <fun>     5
##  6 cancer          svm <list [4]>  <fun>     6
##  7 cancer          svm <list [4]>  <fun>     7
##  8 cancer          svm <list [4]>  <fun>     8
##  9 cancer          svm <list [4]>  <fun>     9
## 10 cancer          svm <list [4]>  <fun>    10
## 11 cancer          svm <list [4]>  <fun>    11
## 12 cancer          svm <list [4]>  <fun>    12
## 13 cancer          svm <list [4]>  <fun>    13
## 14 cancer          svm <list [4]>  <fun>    14
## 15 cancer          svm <list [4]>  <fun>    15
## 16 cancer   naiveBayes <list [1]>  <fun>    16
## 
## attr(,"class")
## [1] "pipelearner"

Note that additional arguments for the model functions are passed on via the ... argument of learn_models().
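
Vector-valued arguments are crossed into a grid of hyperparameter combinations, which is why the five cost values times two kernels above yield ten svm rows in $models. A small illustration (a sketch, not evaluated here):

# Sketch, not evaluated here: every combination of the vectors passed via ...
# becomes one model row, e.g. 2 cost values x 2 kernels -> 4 svm models
pl_grid = pl %>%
  learn_models( svm, cancer~.
                , cost   = c(0.1, 1)
                , kernel = c('linear', 'radial') )

pl_grid$models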

3.1 Adding models that do not support a formula syntax such as xgboost

We have to write a wrapper that translates the formula syntax into a matrix syntax.

wr_xgboost = function(data, formula, diagnose = F, ... ){
  
  if(diagnose){
    print(data)
    print(formula)
  }
  
  data_matrix = model.matrix( formula, data)
  
  col_label = oetteR::f_manip_get_response_variable_from_formula(formula) %>%
    as.character()
  
  #xgb wants numeric 0,1 
  label = oetteR::f_manip_factor_2_numeric( data[[col_label]] )
  
  m = xgboost( data_matrix, label, ... )
  
  return( m )
}

pl_m_xgb = pl_m %>%
  learn_models( wr_xgboost, cancer~., nrounds = 5, objective = 'binary:logistic', diagnose = F )
## Warning: `cross_d()` is deprecated; please use `cross_df()` instead.

## Warning: `cross_d()` is deprecated; please use `cross_df()` instead.

learn() starts the final training of the models

pl_learn = pl_m_xgb %>%
  learn()
## [1]  train-error:0.022801 
## [2]  train-error:0.021173 
## [3]  train-error:0.016287 
## [4]  train-error:0.016287 
## [5]  train-error:0.014658 
## [1]  train-error:0.026059 
## [2]  train-error:0.024430 
## [3]  train-error:0.022801 
## [4]  train-error:0.016287 
## [5]  train-error:0.014658 
## [1]  train-error:0.024430 
## [2]  train-error:0.014658 
## [3]  train-error:0.013029 
## [4]  train-error:0.014658 
## [5]  train-error:0.014658 
## [1]  train-error:0.029268 
## [2]  train-error:0.022764 
## [3]  train-error:0.019512 
## [4]  train-error:0.017886 
## [5]  train-error:0.016260 
## [1]  train-error:0.026016 
## [2]  train-error:0.024390 
## [3]  train-error:0.021138 
## [4]  train-error:0.017886 
## [5]  train-error:0.016260 
## [1]  train-error:0.032520 
## [2]  train-error:0.022764 
## [3]  train-error:0.022764 
## [4]  train-error:0.017886 
## [5]  train-error:0.016260 
## [1]  train-error:0.027642 
## [2]  train-error:0.027642 
## [3]  train-error:0.021138 
## [4]  train-error:0.019512 
## [5]  train-error:0.019512 
## [1]  train-error:0.024390 
## [2]  train-error:0.014634 
## [3]  train-error:0.014634 
## [4]  train-error:0.009756 
## [5]  train-error:0.009756 
## [1]  train-error:0.024390 
## [2]  train-error:0.022764 
## [3]  train-error:0.017886 
## [4]  train-error:0.016260 
## [5]  train-error:0.016260 
## [1]  train-error:0.024390 
## [2]  train-error:0.021138 
## [3]  train-error:0.019512 
## [4]  train-error:0.017886 
## [5]  train-error:0.016260

3.2 Adding model performance metrics

3.2.1 Predictions

Adding predictions to the dataframe using modelr::add_predictions() does not return predictions in a uniform way when predicting categorical variables. We therefore have to write our own function in which we adjust predict() according to the fitted model. Ideally, for categorical variables, we would like to have the probability. The following function calls the correct predict() method for each model with the parameters needed to return class probabilities.

return_predictions = function(test, fit, model_type, formula, diagnose = F){
  
  if(diagnose == T){
    print(model_type)
    print(names(fit))
    print(nrow(test))
  }
  
  test_df = as.data.frame(test)
  
  if(model_type == 'glm'){
    preds = predict(fit, newdata =  test_df, type = 'response') 
  }
  
  if(model_type == 'randomForest'){
    preds = predict(fit, newdata = test_df, type = 'prob')[,2] 
  }
  
  if(model_type == 'wr_xgboost'){
    
    data_matrix = model.matrix( formula, test_df )
    
    preds = predict(fit, data_matrix)
  }
  
  if(model_type == 'svm'){
    pred_obj = predict(fit, newdata = test_df, probability = T )
    at = attributes(pred_obj)
    preds = at$probabilities[,2]
  }
  
  if(model_type == 'naiveBayes'){
    preds = predict(fit, newdata = test_df, type = 'raw' )[,2]
  }
  
  # this does not work yet
  if(model_type == 'gbm'){
    preds = predict(fit, newdata = test_df, type = 'response' )
  }
  
  if(! length(preds) == nrow(test_df)){
    print(model_type)
    print(length(preds))
    print(nrow(test_df))
    print(preds)
    stop('predictions and testdata do not have same lengths')
  }
  

  return( preds )
}

3.2.2 Model Metrics

The broom package is not yet compatible with all modelling packages. We therefore need to write our own performance metric functions, preferably based on the actual predictions. For a categorical response variable the AUC of a ROC curve is a good choice if true negatives are as important as true positives. We can easily modify the function below since the ROCR package supports a number of performance metrics. See ?ROCR::performance()

return_auc = function(pred, test, response_var) {
  
  #untangle test and pred objects
  
  if(is.tibble(pred)){
    pred = pred[[1]]
  }
  
  
  pred_vals = pred

  test_vals = as.vector( test$data[test$idx, response_var] )
  test_vals = test_vals[[1]]


  pr = prediction( pred_vals, test_vals )

  p = performance( pr, measure='auc')
  return( p@y.values[[1]] )

}
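
Since ROCR supports other measures, a variant only needs the measure swapped. For example, a hypothetical return_max_acc() returning the best accuracy over all probability cutoffs could look like this (a sketch, not used below):

# Sketch, not used below: same unpacking as in return_auc, only the ROCR measure changes
return_max_acc = function(pred, test, response_var) {
  
  if( is.tibble(pred) ){
    pred = pred[[1]]
  }
  
  test_vals = as.vector( test$data[test$idx, response_var] )
  test_vals = test_vals[[1]]
  
  pr  = prediction( pred, test_vals )
  acc = performance( pr, measure = 'acc' )  # accuracy at every probability cutoff
  
  return( max( acc@y.values[[1]] ) )        # best achievable accuracy
}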

3.2.3 Visualisation of Model Performance

Here we visualise the auc of each cross validation pair for each model

pl_m_t = pl_learn %>%
  mutate( pred = pmap( list(test = test, fit = fit, model_type = model)
                       , return_predictions
                       , formula = cancer~.
                       , diagnose = F)
          , models.id = as.integer(models.id)
          , models.id = as.factor(models.id)
          ) 


pl_m_t = pl_m_t %>%
  mutate( auc = pmap( list(pred, test, target), return_auc )) %>%
  unnest(auc)

# this function looks for pairings with significant differences
oetteR::f_plot_generate_comparison_pairs( pl_m_t, 'auc', 'models.id')
## list()
pl_m_t %>%
  ggplot( aes(models.id, auc ) ) +
  geom_boxplot( aes( fill = model ), color = 'black' ) +
  ggbeeswarm::geom_beeswarm( aes(color = cv_pairs.id), priority='density', cex=1.5 ) +
  ggpubr::stat_compare_means( ) +
  scale_fill_brewer( palette = 'Greys') +
  scale_color_manual( values = oetteR::f_plot_col_vector74(greys = F, only_unique = T) ) +
  ylim( c (0.95, 1.01) )

The high p value tells us that the performance of all models is within the same range and none is significantly better than the others. However, there seem to be some subtle differences. We see a difference between the various versions of the svm models, which at first glance also seem to be the best performing models, while naiveBayes seems to have the worst performance. In this case we would choose one of the simpler models, glm or naiveBayes; since glm also has the best overall performance, we would go with glm.

Recall that the model performances are really close. If we were to re-split our dataset into a new set of cross validation pairs, the performance ranking would most likely shift.

4 Calling a vote

Different models might capture different aspects of the data. Just as random forest or xgboost ask all the trees they generate for their vote, we can do the same with the set of models we just generated. Why this works most of the time is described here.

This practice is common in machine learning competitions, but it leaves you with a model that is really hard to interpret and difficult to put into production.

4.1 Select the best models

pl_top = pl_m_t %>%
  group_by( models.id, model ) %>%
  summarise( mean_auc = mean(auc) ) %>%
  group_by( model ) %>%
  mutate( rank = rank( desc(mean_auc) ) ) %>%
  filter( rank == 1) %>%
  arrange( desc(mean_auc) )

pl_top %>%
  select( models.id, model, mean_auc) %>%
  knitr::kable()
models.id   model          mean_auc
7           svm           0.9954641
1           glm           0.9947301
4           randomForest  0.9942048
17          wr_xgboost    0.9895180
16          naiveBayes    0.9858286

4.2 Vote

We first have to unnest the predictions for each cross validation pair and assign an id to each prediction. By grouping over the cross validation pair id and the observation id, we can then calculate a mean prediction for each single observation in the data.

Finally, we add the test data for each cross validation pair and the target column so that we can reuse return_auc.

pl_top_vote = pl_top %>%
  left_join( pl_m_t ) %>%
  unnest(pred, .drop = FALSE) %>%
  group_by( model, cv_pairs.id) %>%
  mutate( id = row_number() ) %>%
  group_by( cv_pairs.id, id ) %>%
  summarise( mean_pred = mean(pred) )
## Joining, by = c("models.id", "model")
# we cannot group on lists so we have to extract the info and rejoin
pl_cv_info = pl_top %>%
  left_join( pl_m_t ) %>%
  ungroup() %>%
  select( cv_pairs.id, test, target) %>%
  group_by(cv_pairs.id) %>%
  mutate( rwn = row_number() ) %>%
  filter( rwn == 1) %>%
  select( - rwn )
## Joining, by = c("models.id", "model")
pl_vote_auc = pl_top_vote %>%
  group_by(cv_pairs.id) %>%
  nest( mean_pred, .key = 'pred' ) %>%
  left_join( pl_cv_info ) %>%
  mutate( auc = pmap_dbl( list(pred, test, target), return_auc)
          , models.id = 'Vote'
          , model = 'Vote' )  %>%
  select(cv_pairs.id, models.id, model, auc )
## Joining, by = "cv_pairs.id"
# rowbind with previous results

pl_vote = pl_m_t %>%
  select(cv_pairs.id, models.id, model, auc ) %>%
  bind_rows(pl_vote_auc) %>%
  mutate( models.id = oetteR::f_manip_factor_2_numeric(models.id)
          , models.id = as.factor(models.id )
          , model = fct_relevel( model, 'glm', 'randomForest', 'svm', 'naiveBayes', 'wr_xgboost') )
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in oetteR::f_manip_factor_2_numeric(models.id): NAs introduced by
## coercion

4.3 Plot Vote

pl_vote %>%
  ggplot( aes(models.id, auc ) ) +
  geom_boxplot( aes( fill = model ), color = 'black' ) +
  ggbeeswarm::geom_beeswarm( aes(color = cv_pairs.id), priority='density', cex=1.5 ) +
  ggpubr::stat_compare_means( ) +
  scale_fill_brewer( palette = 'Greys') +
  scale_color_manual( values = oetteR::f_plot_col_vector74(greys = F, only_unique = T) ) +
  ylim( c (0.95, 1.01) )

4.4 Means

pl_top = pl_vote %>%
  group_by( models.id, model ) %>%
  summarise( mean_auc = mean(auc) ) %>%
  group_by( model ) %>%
  mutate( rank = rank( desc(mean_auc) ) ) %>%
  filter( rank == 1) %>%
  arrange( desc(mean_auc) )

pl_top %>%
  select( models.id, model, mean_auc) %>%
  knitr::kable()
models.id   model          mean_auc
7           svm           0.9954641
1           glm           0.9947301
NA          Vote          0.9942193
4           randomForest  0.9942048
17          wr_xgboost    0.9895180
16          naiveBayes    0.9858286

In this particular case voting did not improve the results but only averaged them. One possible reason is that, if we look at the performance on the single cross validation pairs, they rank very similarly for each model, so the models make correlated errors. For example, the test set for cv_pairs.id == 1 seems to be particularly hard to predict for every model.

# xgboost() saves its models on disk
if( file.exists('xgboost.model') ) file.remove('xgboost.model')
## [1] TRUE