1 Overview
2 Preparations
3 The data set
4 Fit a random forest
5 Investigating cues: BreastCancer data set example
- 5.1 Main effects
- 5.2 Interactions between cues
6 Further examples
- 6.1 Continuous cues: PimaIndiansDiabetes data set example
- 6.2 Mixed-type cues: Ionosphere data set example

1 Overview

This report shows how to use the “exploratory analysis with random forests” R package (edarf; hosted on GitHub) to understand how non-binary and continuous cues are related to a binary outcome. This can help in deciding how to binarize such cues (e.g., when constructing a fast-and-frugal tree, FFT).

2 Preparations

The edarf R package is only available as source code on GitHub (there is no precompiled binary), so you need the devtools R package to install it and a toolchain to compile it. On a Mac, install the Xcode command line tools first.

# Run this report in a fresh R session rather than clearing the workspace with
# `rm(list = ls())`: that call only removes objects from the global
# environment (attached packages and options are left untouched), and it would
# silently delete the objects of anyone who runs this code interactively.

# install `edarf` package
# devtools::install_github("zmjones/edarf")

# load packages
library(edarf)
library(dplyr)
library(randomForest)

3 The data set

We will use the “BreastCancer” data set from the mlbench R package.

# mlbench provides the example data sets used in this report
library(mlbench)

# load the data set into the workspace and show one line per column
data(list = "BreastCancer", package = "mlbench")
glimpse(BreastCancer)

## Observations: 699
## Variables:
## $ Id              (chr) "1000025", "1002945", "1015425", "1016277", "1...
## $ Cl.thickness    (fctr) 5, 5, 3, 6, 4, 8, 1, 2, 2, 4, 1, 2, 5, 1, 8, ...
## $ Cell.size       (fctr) 1, 4, 1, 8, 1, 10, 1, 1, 1, 2, 1, 1, 3, 1, 7,...
## $ Cell.shape      (fctr) 1, 4, 1, 8, 1, 10, 1, 2, 1, 1, 1, 1, 3, 1, 5,...
## $ Marg.adhesion   (fctr) 1, 5, 1, 1, 3, 8, 1, 1, 1, 1, 1, 1, 3, 1, 10,...
## $ Epith.c.size    (fctr) 2, 7, 2, 3, 2, 7, 2, 2, 2, 2, 1, 2, 2, 2, 7, ...
## $ Bare.nuclei     (fctr) 1, 10, 2, 4, 1, 10, 10, 1, 1, 1, 1, 1, 3, 3, ...
## $ Bl.cromatin     (fctr) 3, 3, 3, 3, 3, 9, 3, 3, 1, 2, 3, 2, 4, 3, 5, ...
## $ Normal.nucleoli (fctr) 1, 2, 1, 7, 1, 7, 1, 1, 1, 1, 1, 1, 4, 1, 5, ...
## $ Mitoses         (fctr) 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 4, ...
## $ Class           (fctr) benign, benign, benign, benign, benign, malig...

The data set contains missing values (NAs): the summary below shows 16 NAs in Bare.nuclei.

# per-column summary (level counts for the factor columns, including NAs)
BreastCancer %>% summary()

##       Id             Cl.thickness   Cell.size     Cell.shape 
##  Length:699         1      :145   1      :384   1      :353  
##  Class :character   5      :130   10     : 67   2      : 59  
##  Mode  :character   3      :108   3      : 52   10     : 58  
##                     4      : 80   2      : 45   3      : 56  
##                     10     : 69   4      : 40   4      : 44  
##                     2      : 50   5      : 30   5      : 34  
##                     (Other):117   (Other): 81   (Other): 95  
##  Marg.adhesion  Epith.c.size  Bare.nuclei   Bl.cromatin  Normal.nucleoli
##  1      :407   2      :386   1      :402   2      :166   1      :443    
##  2      : 58   3      : 72   10     :132   3      :165   10     : 61    
##  3      : 58   4      : 48   2      : 30   1      :152   3      : 44    
##  10     : 55   1      : 47   5      : 30   7      : 73   2      : 36    
##  4      : 33   6      : 41   3      : 28   4      : 40   8      : 24    
##  8      : 25   5      : 39   (Other): 61   5      : 34   6      : 22    
##  (Other): 63   (Other): 66   NA's   : 16   (Other): 69   (Other): 69    
##     Mitoses          Class    
##  1      :579   benign   :458  
##  2      : 35   malignant:241  
##  3      : 33                  
##  10     : 14                  
##  4      : 12                  
##  7      :  9                  
##  (Other): 17

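We impute the missing values using na.roughfix from randomForest, which replaces NAs with the column median (numeric variables) or the most frequent level (factors).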

# Drop the Id variable, then impute the missing values with na.roughfix().
# as_tibble() is used instead of tbl_df(), which is deprecated since
# dplyr 1.0.0.
dat <- BreastCancer %>%
    select(-Id) %>%
    na.roughfix() %>%
    as_tibble()

# Optional (disabled): convert every column to numeric and restore Class as a
# factor.
# NOTE(review): the original note here read "lines appear once the variables
# are connected"; presumably it refers to how the partial dependence plots are
# drawn for numeric cues -- confirm before enabling.
# dat <- lapply(dat, as.numeric) %>% as.data.frame %>% as_tibble
# dat$Class <- as.factor(dat$Class)

4 Fit a random forest

# Fix the seed before fitting so that the forest can be reproduced, then grow
# 2000 trees predicting Class from all other columns of `dat`.
set.seed(1)
fit <- randomForest(Class ~ ., data = dat, ntree = 2000)

5 Investigating cues: BreastCancer data set example

5.1 Main effects

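This custom function computes the partial dependence of the predicted class probability on a single cue and plots it, showing the cue’s importance in the plot title. It uses the partial_dependence function from edarf.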

# Plot the partial dependence of the predicted class probability on one cue.
#
# Arguments:
#   fit       fitted model, passed on to partial_dependence()
#   dat       data set, passed as `data` to partial_dependence()
#   var.name  name of the cue to investigate (character)
#   imp       data frame of variable importances: the importance value is
#             read from its first column, the cue names from `var.name`
#
# Value:
#   The table returned by partial_dependence(), invisibly. (The original body
#   contained a bare `pd` statement, which has no effect inside a function;
#   returning the table invisibly makes it available without printing it.)
fnc_pd <- function(fit, dat, var.name, imp) {
    pd <- partial_dependence(
        fit,
        data = dat,
        var = var.name,
        type = "prob")

    # Importance of this cue for the plot title. `[[1]]` extracts the first
    # column as a vector for both plain data frames and tibbles.
    cue_imp <- imp[[1]][imp$var.name == var.name]

    # Column 3 of `pd` goes on the x-axis and column 2 on the y-axis.
    # NOTE(review): this assumes partial_dependence() returns the class
    # probabilities in columns 1-2 and the cue in column 3 -- confirm for the
    # installed edarf version.
    plot(
        pd[, c(3, 2)],
        type = "b",
        ylim = c(0, 1),
        ylab = paste0("Probability ", colnames(pd)[2]),
        main = paste0(
            colnames(pd)[3],
            " (imp = ", format(cue_imp, digits = 3), ")")
    )

    invisible(pd)
}

What are the most promising cues?

# Variable importances as a data frame, with the cue names copied from the
# row names into `var.name`, sorted from most to least important.
# The seed is reset here as well, so the random number stream seen by the
# code that follows starts from a known state.
set.seed(1)
imp <- data.frame(importance(fit))
imp <- mutate(imp, var.name = row.names(imp))
imp <- arrange(imp, desc(MeanDecreaseGini))
imp

##   MeanDecreaseGini        var.name
## 1        87.552705       Cell.size
## 2        64.664279      Cell.shape
## 3        56.181850     Bare.nuclei
## 4        33.965385     Bl.cromatin
## 5        27.258346 Normal.nucleoli
## 6        21.956171    Epith.c.size
## 7        15.151963    Cl.thickness
## 8         6.218636   Marg.adhesion
## 9         2.059201         Mitoses

Let’s plot them ordered by the variable importance.

# One partial dependence panel per cue on a 3 x 3 grid, most important cue
# first. par() returns the previous settings, which are restored afterwards.
old_par <- par(mfrow = c(3, 3))

# seq_len() (instead of 1:nrow(imp)) gives an empty loop if `imp` has no rows
for (i in seq_len(nrow(imp))) {
    fnc_pd(fit, dat, imp$var.name[i], imp)
}

par(old_par)

5.2 Interactions between cues

Are there interactions? Here, the functions of the inTrees package for investigating common interactions between cues could be helpful.

Let’s look at the interaction between Cell.size and Cell.shape.

# Partial dependence of the predicted class probabilities on Cell.size and
# Cell.shape, requested as an interaction (interaction = TRUE).
# as_tibble() is used instead of tbl_df(), which is deprecated since
# dplyr 1.0.0.
pd_int <- partial_dependence(
    fit,
    data = dat,
    var = c("Cell.size", "Cell.shape"),
    interaction = TRUE,
    type = "prob"
) %>%
    as_tibble()

plot_pd(pd_int)

6 Further examples

6.1 Continuous cues: PimaIndiansDiabetes data set example

# load the data set from mlbench and summarise every column
data(list = "PimaIndiansDiabetes2", package = "mlbench")
PimaIndiansDiabetes2 %>% summary()

##     pregnant         glucose         pressure         triceps     
##  Min.   : 0.000   Min.   : 44.0   Min.   : 24.00   Min.   : 7.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 64.00   1st Qu.:22.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :29.00  
##  Mean   : 3.845   Mean   :121.7   Mean   : 72.41   Mean   :29.15  
##  3rd Qu.: 6.000   3rd Qu.:141.0   3rd Qu.: 80.00   3rd Qu.:36.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##                   NA's   :5       NA's   :35       NA's   :227    
##     insulin            mass          pedigree           age       
##  Min.   : 14.00   Min.   :18.20   Min.   :0.0780   Min.   :21.00  
##  1st Qu.: 76.25   1st Qu.:27.50   1st Qu.:0.2437   1st Qu.:24.00  
##  Median :125.00   Median :32.30   Median :0.3725   Median :29.00  
##  Mean   :155.55   Mean   :32.46   Mean   :0.4719   Mean   :33.24  
##  3rd Qu.:190.00   3rd Qu.:36.60   3rd Qu.:0.6262   3rd Qu.:41.00  
##  Max.   :846.00   Max.   :67.10   Max.   :2.4200   Max.   :81.00  
##  NA's   :374      NA's   :11                                      
##  diabetes 
##  neg:500  
##  pos:268  
##           
##           
##           
##           
##

# compact overview: one line per column with its type and first values
glimpse(PimaIndiansDiabetes2)

## Observations: 768
## Variables:
## $ pregnant (dbl) 6, 1, 8, 1, 0, 5, 3, 10, 2, 8, 4, 10, 10, 1, 5, 7, 0,...
## $ glucose  (dbl) 148, 85, 183, 89, 137, 116, 78, 115, 197, 125, 110, 1...
## $ pressure (dbl) 72, 66, 64, 66, 40, 74, 50, NA, 70, 96, 92, 74, 80, 6...
## $ triceps  (dbl) 35, 29, NA, 23, 35, NA, 32, NA, 45, NA, NA, NA, NA, 2...
## $ insulin  (dbl) NA, NA, NA, 94, 168, NA, 88, NA, 543, NA, NA, NA, NA,...
## $ mass     (dbl) 33.6, 26.6, 23.3, 28.1, 43.1, 25.6, 31.0, 35.3, 30.5,...
## $ pedigree (dbl) 0.627, 0.351, 0.672, 0.167, 2.288, 0.201, 0.248, 0.13...
## $ age      (dbl) 50, 31, 32, 21, 33, 30, 26, 29, 53, 54, 30, 34, 57, 5...
## $ diabetes (fctr) pos, neg, pos, neg, pos, neg, pos, neg, pos, pos, ne...

# Impute the missing values with na.roughfix() and convert to a tibble.
# as_tibble() is used instead of tbl_df(), which is deprecated since
# dplyr 1.0.0.
dat2 <- PimaIndiansDiabetes2 %>%
    na.roughfix() %>%
    as_tibble()

# Fix the seed before fitting so that the forest can be reproduced, then grow
# 2000 trees predicting diabetes from all other columns of `dat2`.
set.seed(1)
fit2 <- randomForest(diabetes ~ ., data = dat2, ntree = 2000)

# Variable importances as a data frame, with the cue names copied from the
# row names into `var.name`, sorted from most to least important.
imp2 <- data.frame(importance(fit2))
imp2 <- mutate(imp2, var.name = row.names(imp2))
imp2 <- arrange(imp2, desc(MeanDecreaseGini))
imp2

##   MeanDecreaseGini var.name
## 1         89.51838  glucose
## 2         55.87957     mass
## 3         46.79497      age
## 4         43.33883 pedigree
## 5         30.24436  insulin
## 6         28.74235 pressure
## 7         27.77833 pregnant
## 8         25.37968  triceps

# One partial dependence panel per cue on a 3 x 3 grid, most important cue
# first. par() returns the previous settings, which are restored afterwards.
old_par <- par(mfrow = c(3, 3))

# seq_len() (instead of 1:nrow(imp2)) gives an empty loop if `imp2` has no rows
for (i in seq_len(nrow(imp2))) {
    fnc_pd(fit2, dat2, imp2$var.name[i], imp2)
}

par(old_par)

6.2 Mixed-type cues: Ionosphere data set example

# load the data set from mlbench and summarise every column
data(list = "Ionosphere", package = "mlbench")
Ionosphere %>% summary()

##  V1      V2            V3                V4                 V5         
##  0: 38   0:351   Min.   :-1.0000   Min.   :-1.00000   Min.   :-1.0000  
##  1:313           1st Qu.: 0.4721   1st Qu.:-0.06474   1st Qu.: 0.4127  
##                  Median : 0.8711   Median : 0.01631   Median : 0.8092  
##                  Mean   : 0.6413   Mean   : 0.04437   Mean   : 0.6011  
##                  3rd Qu.: 1.0000   3rd Qu.: 0.19418   3rd Qu.: 1.0000  
##                  Max.   : 1.0000   Max.   : 1.00000   Max.   : 1.0000  
##        V6                V7                V8                 V9          
##  Min.   :-1.0000   Min.   :-1.0000   Min.   :-1.00000   Min.   :-1.00000  
##  1st Qu.:-0.0248   1st Qu.: 0.2113   1st Qu.:-0.05484   1st Qu.: 0.08711  
##  Median : 0.0228   Median : 0.7287   Median : 0.01471   Median : 0.68421  
##  Mean   : 0.1159   Mean   : 0.5501   Mean   : 0.11936   Mean   : 0.51185  
##  3rd Qu.: 0.3347   3rd Qu.: 0.9692   3rd Qu.: 0.44567   3rd Qu.: 0.95324  
##  Max.   : 1.0000   Max.   : 1.0000   Max.   : 1.00000   Max.   : 1.00000  
##       V10                V11                V12          
##  Min.   :-1.00000   Min.   :-1.00000   Min.   :-1.00000  
##  1st Qu.:-0.04807   1st Qu.: 0.02112   1st Qu.:-0.06527  
##  Median : 0.01829   Median : 0.66798   Median : 0.02825  
##  Mean   : 0.18135   Mean   : 0.47618   Mean   : 0.15504  
##  3rd Qu.: 0.53419   3rd Qu.: 0.95790   3rd Qu.: 0.48237  
##  Max.   : 1.00000   Max.   : 1.00000   Max.   : 1.00000  
##       V13               V14                V15               V16          
##  Min.   :-1.0000   Min.   :-1.00000   Min.   :-1.0000   Min.   :-1.00000  
##  1st Qu.: 0.0000   1st Qu.:-0.07372   1st Qu.: 0.0000   1st Qu.:-0.08170  
##  Median : 0.6441   Median : 0.03027   Median : 0.6019   Median : 0.00000  
##  Mean   : 0.4008   Mean   : 0.09341   Mean   : 0.3442   Mean   : 0.07113  
##  3rd Qu.: 0.9555   3rd Qu.: 0.37486   3rd Qu.: 0.9193   3rd Qu.: 0.30897  
##  Max.   : 1.0000   Max.   : 1.00000   Max.   : 1.0000   Max.   : 1.00000  
##       V17               V18                 V19         
##  Min.   :-1.0000   Min.   :-1.000000   Min.   :-1.0000  
##  1st Qu.: 0.0000   1st Qu.:-0.225690   1st Qu.: 0.0000  
##  Median : 0.5909   Median : 0.000000   Median : 0.5762  
##  Mean   : 0.3819   Mean   :-0.003617   Mean   : 0.3594  
##  3rd Qu.: 0.9357   3rd Qu.: 0.195285   3rd Qu.: 0.8993  
##  Max.   : 1.0000   Max.   : 1.000000   Max.   : 1.0000  
##       V20                V21               V22           
##  Min.   :-1.00000   Min.   :-1.0000   Min.   :-1.000000  
##  1st Qu.:-0.23467   1st Qu.: 0.0000   1st Qu.:-0.243870  
##  Median : 0.00000   Median : 0.4991   Median : 0.000000  
##  Mean   :-0.02402   Mean   : 0.3367   Mean   : 0.008296  
##  3rd Qu.: 0.13437   3rd Qu.: 0.8949   3rd Qu.: 0.188760  
##  Max.   : 1.00000   Max.   : 1.0000   Max.   : 1.000000  
##       V23               V24                V25               V26          
##  Min.   :-1.0000   Min.   :-1.00000   Min.   :-1.0000   Min.   :-1.00000  
##  1st Qu.: 0.0000   1st Qu.:-0.36689   1st Qu.: 0.0000   1st Qu.:-0.33239  
##  Median : 0.5318   Median : 0.00000   Median : 0.5539   Median :-0.01505  
##  Mean   : 0.3625   Mean   :-0.05741   Mean   : 0.3961   Mean   :-0.07119  
##  3rd Qu.: 0.9112   3rd Qu.: 0.16463   3rd Qu.: 0.9052   3rd Qu.: 0.15676  
##  Max.   : 1.0000   Max.   : 1.00000   Max.   : 1.0000   Max.   : 1.00000  
##       V27               V28                V29               V30          
##  Min.   :-1.0000   Min.   :-1.00000   Min.   :-1.0000   Min.   :-1.00000  
##  1st Qu.: 0.2864   1st Qu.:-0.44316   1st Qu.: 0.0000   1st Qu.:-0.23689  
##  Median : 0.7082   Median :-0.01769   Median : 0.4966   Median : 0.00000  
##  Mean   : 0.5416   Mean   :-0.06954   Mean   : 0.3784   Mean   :-0.02791  
##  3rd Qu.: 0.9999   3rd Qu.: 0.15354   3rd Qu.: 0.8835   3rd Qu.: 0.15407  
##  Max.   : 1.0000   Max.   : 1.00000   Max.   : 1.0000   Max.   : 1.00000  
##       V31               V32                 V33         
##  Min.   :-1.0000   Min.   :-1.000000   Min.   :-1.0000  
##  1st Qu.: 0.0000   1st Qu.:-0.242595   1st Qu.: 0.0000  
##  Median : 0.4428   Median : 0.000000   Median : 0.4096  
##  Mean   : 0.3525   Mean   :-0.003794   Mean   : 0.3494  
##  3rd Qu.: 0.8576   3rd Qu.: 0.200120   3rd Qu.: 0.8138  
##  Max.   : 1.0000   Max.   : 1.000000   Max.   : 1.0000  
##       V34            Class    
##  Min.   :-1.00000   bad :126  
##  1st Qu.:-0.16535   good:225  
##  Median : 0.00000             
##  Mean   : 0.01448             
##  3rd Qu.: 0.17166             
##  Max.   : 1.00000

# compact overview: one line per column with its type and first values
glimpse(Ionosphere)

## Observations: 351
## Variables:
## $ V1    (fctr) 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1...
## $ V2    (fctr) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ V3    (dbl) 0.99539, 1.00000, 1.00000, 1.00000, 1.00000, 0.02337, 0....
## $ V4    (dbl) -0.05889, -0.18829, -0.03365, -0.45161, -0.02401, -0.005...
## $ V5    (dbl) 0.85243, 0.93035, 1.00000, 1.00000, 0.94140, -0.09924, 0...
## $ V6    (dbl) 0.02306, -0.36156, 0.00485, 1.00000, 0.06531, -0.11949, ...
## $ V7    (dbl) 0.83398, -0.10868, 1.00000, 0.71216, 0.92106, -0.00763, ...
## $ V8    (dbl) -0.37708, -0.93597, -0.12062, -1.00000, -0.23255, -0.118...
## $ V9    (dbl) 1.00000, 1.00000, 0.88965, 0.00000, 0.77152, 0.14706, 0....
## $ V10   (dbl) 0.03760, -0.04549, 0.01198, 0.00000, -0.16399, 0.06637, ...
## $ V11   (dbl) 0.85243, 0.50874, 0.73082, 0.00000, 0.52798, 0.03786, 0....
## $ V12   (dbl) -0.17755, -0.67743, 0.05346, 0.00000, -0.20275, -0.06302...
## $ V13   (dbl) 0.59755, 0.34432, 0.85443, 0.00000, 0.56409, 0.00000, 0....
## $ V14   (dbl) -0.44945, -0.69707, 0.00827, 0.00000, -0.00712, 0.00000,...
## $ V15   (dbl) 0.60536, -0.51685, 0.54591, -1.00000, 0.34395, -0.04572,...
## $ V16   (dbl) -0.38223, -0.97515, 0.00299, 0.14516, -0.27457, -0.15540...
## $ V17   (dbl) 0.84356, 0.05499, 0.83775, 0.54094, 0.52940, -0.00343, 0...
## $ V18   (dbl) -0.38542, -0.62237, -0.13644, -0.39330, -0.21780, -0.101...
## $ V19   (dbl) 0.58212, 0.33109, 0.75535, -1.00000, 0.45107, -0.11575, ...
## $ V20   (dbl) -0.32192, -1.00000, -0.08540, -0.54467, -0.17813, -0.054...
## $ V21   (dbl) 0.56971, -0.13151, 0.70887, -0.69975, 0.05982, 0.01838, ...
## $ V22   (dbl) -0.29674, -0.45300, -0.27502, 1.00000, -0.35575, 0.03669...
## $ V23   (dbl) 0.36946, -0.18056, 0.43385, 0.00000, 0.02309, 0.01519, 0...
## $ V24   (dbl) -0.47357, -0.35734, -0.12062, 0.00000, -0.52879, 0.00888...
## $ V25   (dbl) 0.56811, -0.20332, 0.57528, 1.00000, 0.03286, 0.03513, 0...
## $ V26   (dbl) -0.51171, -0.26569, -0.40220, 0.90695, -0.65158, -0.0153...
## $ V27   (dbl) 0.41078, -0.20468, 0.58984, 0.51613, 0.13290, -0.03240, ...
## $ V28   (dbl) -0.46168, -0.18401, -0.22145, 1.00000, -0.53206, 0.09223...
## $ V29   (dbl) 0.21266, -0.19040, 0.43100, 1.00000, 0.02431, -0.07859, ...
## $ V30   (dbl) -0.34090, -0.11593, -0.17365, -0.20099, -0.62197, 0.0073...
## $ V31   (dbl) 0.42267, -0.16626, 0.60436, 0.25682, -0.05707, 0.00000, ...
## $ V32   (dbl) -0.54487, -0.06288, -0.24180, 1.00000, -0.59573, 0.00000...
## $ V33   (dbl) 0.18641, -0.13738, 0.56045, -0.32382, -0.04608, -0.00039...
## $ V34   (dbl) -0.45300, -0.02447, -0.38238, 1.00000, -0.65697, 0.12011...
## $ Class (fctr) good, bad, good, bad, good, bad, good, bad, good, bad, ...

# The summary of Ionosphere reports no NAs, so the data are only converted to
# a tibble. as_tibble() is used instead of tbl_df(), which is deprecated since
# dplyr 1.0.0.
dat3 <- as_tibble(Ionosphere)

# Fix the seed before fitting so that the forest can be reproduced, then grow
# 2000 trees predicting Class from all other columns of `dat3`.
set.seed(1)
fit3 <- randomForest(Class ~ ., data = dat3, ntree = 2000)


# Variable importances as a data frame, with the cue names copied from the
# row names into `var.name`, sorted from most to least important.
imp3 <- data.frame(importance(fit3))
imp3 <- mutate(imp3, var.name = row.names(imp3))
imp3 <- arrange(imp3, desc(MeanDecreaseGini))
imp3

##    MeanDecreaseGini var.name
## 1         21.667274       V5
## 2         14.944095       V3
## 3         14.597021       V7
## 4         14.489918      V27
## 5          7.551723       V8
## 6          6.721020       V6
## 7          6.204016       V4
## 8          5.318193      V14
## 9          4.645229      V31
## 10         4.446285      V33
## 11         4.119335      V18
## 12         4.102099      V16
## 13         4.012420      V29
## 14         3.995992       V1
## 15         3.949552      V28
## 16         3.761316      V24
## 17         3.486166      V22
## 18         3.345619      V12
## 19         2.878279      V10
## 20         2.811008      V34
## 21         2.560680      V21
## 22         2.392086      V20
## 23         2.279570       V9
## 24         2.237442      V23
## 25         2.220036      V32
## 26         2.043450      V26
## 27         1.937037      V15
## 28         1.606546      V25
## 29         1.568493      V17
## 30         1.552036      V13
## 31         1.391076      V19
## 32         1.219280      V11
## 33         1.148375      V30
## 34         0.000000       V2

# One partial dependence panel per cue on a 9 x 4 grid, most important cue
# first. par() returns the previous settings, which are restored afterwards.
old_par <- par(mfrow = c(9, 4))

# seq_len() (instead of 1:nrow(imp3)) gives an empty loop if `imp3` has no rows
for (i in seq_len(nrow(imp3))) {
    fnc_pd(fit3, dat3, imp3$var.name[i], imp3)
}

par(old_par)

Understanding cues with random forests

Stefan M. Herzog

2015-08-06