library(catboost)
library(caret)
library(titanic)
train_path <- system.file("extdata", "adult_train.1000", package = "catboost")
test_path <- system.file("extdata", "adult_test.1000", package = "catboost")

# Read all 15 columns as numeric, except the categorical ones, which are
# marked as factors so CatBoost treats them as categorical features.
column_description_vector <- rep('numeric', 15)
cat_features <- c(3, 5, 7, 8, 9, 10, 11, 15)
for (i in cat_features) {
  column_description_vector[i] <- 'factor'
}
train <- read.table(train_path, header = FALSE, sep = "\t",
                    colClasses = column_description_vector, na.strings = 'NAN')
test <- read.table(test_path, header = FALSE, sep = "\t",
                   colClasses = column_description_vector, na.strings = 'NAN')
# The first column holds the target label; the remaining columns are features.
target <- 1
train_pool <- catboost.load_pool(data = train[, -target], label = train[, target])
test_pool <- catboost.load_pool(data = test[, -target], label = test[, target])
fit_params <- list(iterations = 100,
                   thread_count = 10,
                   loss_function = 'Logloss',
                   ignored_features = c(4, 9),
                   border_count = 32,
                   depth = 5,
                   learning_rate = 0.03,
                   l2_leaf_reg = 3.5,
                   train_dir = 'train_dir',
                   logging_level = 'Silent')
# Train on the training pool; the test pool is passed as an evaluation set.
model <- catboost.train(train_pool, test_pool, fit_params)
# Accuracy helper for Logloss: predicted probabilities above 0.5 are mapped
# to class 1, the rest to -1, matching the -1/1 labels of the adult data.
calc_accuracy <- function(prediction, expected) {
  labels <- ifelse(prediction > 0.5, 1, -1)
  accuracy <- sum(labels == expected) / length(labels)
  return(accuracy)
}
# Predicted probability of the positive class for each test example.
prediction <- catboost.predict(model, test_pool, prediction_type = 'Probability')
cat("Sample predictions: ", sample(prediction, 5), "\n")
## Sample predictions: 0.9200995 0.9032273 0.5809853 0.8153078 0.7570051
##
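# The confusion table below compares hard class predictions (0/1 for Logloss)
# with the true labels (-1/1 in this data set). A sketch of the calls that
# would reproduce it:
labels <- catboost.predict(model, test_pool, prediction_type = 'Class')
table(labels, test[, target])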
## labels  -1   1
##      0 421 114
##      1  79 386
##
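# The accuracy figure below follows from the helper defined above, applied to
# the predicted probabilities and the true test labels:
accuracy <- calc_accuracy(prediction, test[, target])
cat("\nAccuracy: ", accuracy, "\n")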
## Accuracy: 0.807
##
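# The importance table below can be obtained from the fitted model; a sketch
# using catboost.get_feature_importance on the training pool:
cat("\nFeature importances", "\n")
catboost.get_feature_importance(model, train_pool)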
## Feature importances
##           [,1]
## V2   4.8351601
## V3   0.6282719
## V4   0.3590358
## V5  12.0475178
## V6   0.0000000
## V7  16.7281365
## V8  14.8822650
## V9  34.3260767
## V10  0.2452127
## V11  0.0000000
## V12 11.2170518
## V13  0.2650176
## V14  4.4662540
## V15  0.0000000
##
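# The number of trees in the fitted model is available on the model object:
cat("\nTree count: ", model$tree_count, "\n")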
## Tree count: 100