# Get FFTrees v1.3.0 from GitHub
# devtools::install_github("ndphillips/FFTrees")
library(FFTrees)


# This function takes an FFTrees object and returns, for the training or test
# data stored in the object, a vector of predicted probabilities that each
# case is TRUE

getFFTreesProbPred <- function(x,               # An FFTrees object
                               data = "train",  # Make predictions for training or test data?
                               tree = NULL) {   # An optional tree to select

  
# If no tree is specified, use the tree with the highest training balanced accuracy
if(is.null(tree)) {tree <- which.max(x$tree.stats$train$bacc)}

# Get the name of the criterion (outcome) variable from the left-hand side of the formula
criterion <- paste(x$formula)[2]

# Get the decisions (0 / 1) and exit levels for each case under the selected tree

decisions.train <- x$decision$train[,tree]
levels.train <- x$levelout$train[,tree]

decisions.test <- x$decision$test[,tree]
levels.test <- x$levelout$test[,tree]

# Get criterion from training data
train.truth <- x$data$train[[criterion]]

if(data == "train") {
  
  decisions.data <- decisions.train
  levels.data <- levels.train
  
}

if(data == "test") {
  
  decisions.data <- decisions.test
  levels.data <- levels.test
  
}


# Vector of predicted probabilities (one per case), filled in below
pTRUE.pred <- rep(NA, length(decisions.data))


# Set up a grid of all possible combinations of decisions and exit levels
level.decision <- expand.grid(decision = c(0, 1),
                              level = 1:max(levels.train),
                              pTRUE = NA)

for(i in 1:nrow(level.decision)) {
  
  decision.i <- level.decision$decision[i]
  level.i <- level.decision$level[i]
  
  # Criterion values of training cases with this decision and exit level
  train.truth.i <- train.truth[decisions.train == decision.i & 
                                 levels.train == level.i]
  
  # The proportion of TRUE cases in this cell is the predicted probability
  pTRUE.i <- mean(train.truth.i)
  
  # Assign this probability to all matching cases (skip empty cells, where pTRUE.i is NaN)
  
  if(is.finite(pTRUE.i)) {
  
  pTRUE.pred[decisions.data == decision.i & 
               levels.data == level.i] <- pTRUE.i
  
  }
  
}

return(pTRUE.pred)

}
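
Before applying this to real data, here is a minimal sketch of the idea behind getFFTreesProbPred, using small hypothetical vectors (not taken from any FFTrees object): the predicted probability for a case is simply the proportion of TRUE training cases that received the same decision at the same exit level.

# Toy decisions, exit levels, and true classes for six hypothetical training cases
toy.decisions <- c(1, 1, 0, 0, 1, 0)
toy.levels <- c(1, 1, 1, 2, 2, 2)
toy.truth <- c(1, 1, 0, 0, 1, 1)

# Proportion of TRUE cases in each decision x level cell
tapply(toy.truth, list(decision = toy.decisions, level = toy.levels), mean)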

Here's an example with the heartdisease data included in FFTrees:

# Build trees
data.train <- heartdisease[1:150, ]
data.test <- heartdisease[151:303, ]
formula <- diagnosis ~ .

heart.fft <- FFTrees(formula = formula, 
                     data = data.train,
                     data.test = data.test,
                     store.data = TRUE)
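
The helper above defaults to the tree with the highest balanced accuracy on the training data. To see which tree that is for this object, we can read the same tree.stats slot the function uses (output omitted):

# Index of the tree with the highest training balanced accuracy
which.max(heart.fft$tree.stats$train$bacc)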

# Get the predicted probability of TRUE for each case in the training data
train.probpred <- getFFTreesProbPred(heart.fft, data = "train")

# Now the same for the test data
test.probpred <- getFFTreesProbPred(heart.fft, data = "test")

# Here's what the predicted probabilities look like for the test data
test.probpred
##   [1] 0.82539683 0.06666667 0.82539683 0.82539683 0.87500000 0.87500000
##   [7] 0.82539683 0.82539683 0.82539683 0.82539683 0.82539683 0.87500000
##  [13] 0.10937500 0.06666667 0.10937500 0.82539683 0.10937500 0.10937500
##  [19] 0.82539683 0.10937500 0.82539683 0.82539683 0.06666667 0.06666667
##  [25] 0.87500000 0.82539683 0.82539683 0.87500000 0.10937500 0.10937500
##  [31] 0.82539683 0.82539683 0.10937500 0.82539683 0.06666667 0.10937500
##  [37] 0.82539683 0.10937500 0.82539683 0.82539683 0.10937500 0.82539683
##  [43] 0.82539683 0.87500000 0.10937500 0.87500000 0.10937500 0.06666667
##  [49] 0.10937500 0.10937500 0.06666667 0.06666667 0.82539683 0.82539683
##  [55] 0.82539683 0.82539683 0.82539683 0.82539683 0.10937500 0.06666667
##  [61] 0.10937500 0.82539683 0.10937500 0.82539683 0.87500000 0.82539683
##  [67] 0.10937500 0.06666667 0.87500000 0.06666667 0.10937500 0.10937500
##  [73] 0.10937500 0.82539683 0.87500000 0.10937500 0.06666667 0.10937500
##  [79] 0.87500000 0.87500000 0.10937500 0.06666667 0.10937500 0.10937500
##  [85] 0.10937500 0.87500000 0.82539683 0.82539683 0.10937500 0.10937500
##  [91] 0.10937500 0.10937500 0.06666667 0.10937500 0.10937500 0.06666667
##  [97] 0.82539683 0.87500000 0.82539683 0.10937500 0.06666667 0.82539683
## [103] 0.82539683 0.10937500 0.06666667 0.10937500 0.87500000 0.10937500
## [109] 0.10937500 0.82539683 0.10937500 0.10937500 0.10937500 0.10937500
## [115] 0.87500000 0.06666667 0.06666667 0.10937500 0.82539683 0.10937500
## [121] 0.82539683 0.06666667 0.82539683 0.06666667 0.10937500 0.82539683
## [127] 0.10937500 0.10937500 0.10937500 0.06666667 0.82539683 0.10937500
## [133] 0.82539683 0.10937500 0.82539683 0.87500000 0.87500000 0.82539683
## [139] 0.82539683 0.10937500 0.82539683 0.10937500 0.06666667 0.82539683
## [145] 0.06666667 0.10937500 0.87500000 0.82539683 0.82539683 0.82539683
## [151] 0.82539683 0.10937500 0.10937500
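
Because the tree has only a few exits, there are only a few distinct predicted probabilities. A quick way to see them, and how many test cases receive each one (output omitted):

# Distinct predicted probabilities and their frequencies in the test data
table(round(test.probpred, 3))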

Let’s see how ‘good’ the probability predictions were for the training data:

# Create a dataframe of true classes and predicted probabilities
trainpred.df <- data.frame("truth" = data.train[[paste(formula)[2]]],
                           "probpred" = train.probpred)

# Summarise results
aggregate(truth ~ probpred, 
          data = trainpred.df, 
          FUN = mean)
##     probpred      truth
## 1 0.06666667 0.06666667
## 2 0.10937500 0.10937500
## 3 0.82539683 0.82539683
## 4 0.87500000 0.87500000

As you can see, the predicted probabilities match the observed proportions perfectly. This is by construction: for the training data, each predicted probability is simply the proportion of TRUE cases among the training cases with the same decision and exit level.
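
It can also be useful to know how many training cases sit behind each distinct predicted probability, since cells with few cases should give noisier estimates on new data. A quick check, reusing trainpred.df from above (output omitted):

# Number of training cases behind each distinct predicted probability
aggregate(truth ~ probpred, 
          data = trainpred.df, 
          FUN = length)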

Now let’s see how good the probability predictions were for the test data. Here we should expect some differences between the predicted probabilities and the actual proportions of TRUE cases:

# Create a dataframe of true classes and predicted probabilities for the test data

testpred.df <- data.frame("truth" = data.test[[paste(formula)[2]]],
                          "probpred" = test.probpred)

# Summarise results
aggregate(truth ~ probpred, 
          data = testpred.df, 
          FUN = mean)
##     probpred     truth
## 1 0.06666667 0.3750000
## 2 0.10937500 0.1607143
## 3 0.82539683 0.6851852
## 4 0.87500000 0.8947368

Here we see some differences between the predicted probabilities and the observed proportions of TRUE cases in the test data.
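
One simple way to summarise how well calibrated these predictions are overall (not part of the FFTrees output, just a sketch using testpred.df from above) is the Brier score: the mean squared difference between the predicted probability and the 0/1 outcome, where lower is better.

# Brier score: mean squared difference between predicted probability and outcome
mean((testpred.df$probpred - testpred.df$truth)^2)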