# Get the FFTrees v1.3.0 from github
# devtools::install_github("ndphillips/FFTrees")
library(FFTrees)
# This function takes an FFTrees object as an argument, and returns (for training or test data contained in the FFTrees object), a vector of probabilities that the cases are TRUE
getFFTreesProbPred <- function(x, # An FFTrees object
data = "train", # Make predictions for training or test data?
tree = NULL) { # An optional tree to select
if(is.null(tree)) {tree <- which(x$tree.stats$train$bacc == max(x$tree.stats$train$bacc))}
criterion <- paste(x$formula)[2]
# Get classification statistics for the tree
decisions.train <- x$decision$train[,tree]
levels.train <- x$levelout$train[,tree]
decisions.test <- x$decision$test[,tree]
levels.test <- x$levelout$test[,tree]
# Get criterion from training data
train.truth <- x$data$train[[criterion]]
if(data == "train") {
decisions.data <- decisions.train
levels.data <- levels.train
}
if(data == "test") {
decisions.data <- decisions.test
levels.data <- levels.test
}
pTRUE.pred <- rep(NA, length(decisions.data))
# Set up loop for all possibile combinations of decisions and levels
level.decision <- expand.grid(decision = c(0, 1),
level = 1:max(levels.train),
pTRUE = NA)
for(i in 1:nrow(level.decision)) {
decision.i <- level.decision$decision[i]
level.i <- level.decision$level[i]
train.truth.i <- train.truth[decisions.train == decision.i &
levels.train == level.i]
pTRUE.i <- mean(train.truth.i)
# Write to pTRUE.pred
if(is.finite(pTRUE.i)) {
pTRUE.pred[decisions.data == decision.i &
levels.data == level.i] <- pTRUE.i
}
}
return(pTRUE.pred)
}
Example with heart disease
# Build trees
data.train <- heartdisease[1:150,]
data.test <- heartdisease[151:303,]
formula <- diagnosis ~.
heart.fft <- FFTrees(formula = formula,
data = data.train,
data.test = data.test,
store.data = TRUE)
# Get probability of TRUE predictions for each case in training data
train.probpred <- getFFTreesProbPred(heart.fft, data = "train")
# Now the same for the test data
test.probpred <- getFFTreesProbPred(heart.fft, data = "test")
# here's how the probability of TRUE predictions look for the test data
test.probpred
## [1] 0.82539683 0.06666667 0.82539683 0.82539683 0.87500000 0.87500000
## [7] 0.82539683 0.82539683 0.82539683 0.82539683 0.82539683 0.87500000
## [13] 0.10937500 0.06666667 0.10937500 0.82539683 0.10937500 0.10937500
## [19] 0.82539683 0.10937500 0.82539683 0.82539683 0.06666667 0.06666667
## [25] 0.87500000 0.82539683 0.82539683 0.87500000 0.10937500 0.10937500
## [31] 0.82539683 0.82539683 0.10937500 0.82539683 0.06666667 0.10937500
## [37] 0.82539683 0.10937500 0.82539683 0.82539683 0.10937500 0.82539683
## [43] 0.82539683 0.87500000 0.10937500 0.87500000 0.10937500 0.06666667
## [49] 0.10937500 0.10937500 0.06666667 0.06666667 0.82539683 0.82539683
## [55] 0.82539683 0.82539683 0.82539683 0.82539683 0.10937500 0.06666667
## [61] 0.10937500 0.82539683 0.10937500 0.82539683 0.87500000 0.82539683
## [67] 0.10937500 0.06666667 0.87500000 0.06666667 0.10937500 0.10937500
## [73] 0.10937500 0.82539683 0.87500000 0.10937500 0.06666667 0.10937500
## [79] 0.87500000 0.87500000 0.10937500 0.06666667 0.10937500 0.10937500
## [85] 0.10937500 0.87500000 0.82539683 0.82539683 0.10937500 0.10937500
## [91] 0.10937500 0.10937500 0.06666667 0.10937500 0.10937500 0.06666667
## [97] 0.82539683 0.87500000 0.82539683 0.10937500 0.06666667 0.82539683
## [103] 0.82539683 0.10937500 0.06666667 0.10937500 0.87500000 0.10937500
## [109] 0.10937500 0.82539683 0.10937500 0.10937500 0.10937500 0.10937500
## [115] 0.87500000 0.06666667 0.06666667 0.10937500 0.82539683 0.10937500
## [121] 0.82539683 0.06666667 0.82539683 0.06666667 0.10937500 0.82539683
## [127] 0.10937500 0.10937500 0.10937500 0.06666667 0.82539683 0.10937500
## [133] 0.82539683 0.10937500 0.82539683 0.87500000 0.87500000 0.82539683
## [139] 0.82539683 0.10937500 0.82539683 0.10937500 0.06666667 0.82539683
## [145] 0.06666667 0.10937500 0.87500000 0.82539683 0.82539683 0.82539683
## [151] 0.82539683 0.10937500 0.10937500
Let’s see how ‘good’ the probability prdictions were for the training data
# crate a dataframe of true classes and predicted probabilities
trainpred.df <- data.frame("truth" = data.train[[paste(formula)[2]]],
"probpred" = train.probpred)
# Summarise results
aggregate(truth ~ probpred,
data = trainpred.df,
FUN = mean)
## probpred truth
## 1 0.06666667 0.06666667
## 2 0.10937500 0.10937500
## 3 0.82539683 0.82539683
## 4 0.87500000 0.87500000
As you can see, the probabilities match perfectly. This is because the probabilities are in fact made to be identical for training data.
Let’s see how good the probability predictions were for the test data, now, we should expect some differences between the predicted probabilities and the actual probabilities:
# crate a dataframe of true classes and predicted probabilities for test data
testpred.df <- data.frame("truth" = data.test[[paste(formula)[2]]],
"probpred" = test.probpred)
# Summarise results
aggregate(truth ~ probpred,
data = testpred.df,
FUN = mean)
## probpred truth
## 1 0.06666667 0.3750000
## 2 0.10937500 0.1607143
## 3 0.82539683 0.6851852
## 4 0.87500000 0.8947368
Here, we see some differences between the probability predictions and the data.