library(catboost)
library(caret)
library(titanic)
train_path <- system.file("extdata", "adult_train.1000", package = "catboost")
test_path <- system.file("extdata", "adult_test.1000", package = "catboost")

# Read all 15 columns as numeric, except the categorical ones, which are
# marked as factors so CatBoost treats them as categorical features.
column_description_vector <- rep('numeric', 15)
cat_features <- c(3, 5, 7, 8, 9, 10, 11, 15)
for (i in cat_features) {
  column_description_vector[i] <- 'factor'
}
train <- read.table(train_path, header = FALSE, sep = "\t",
                    colClasses = column_description_vector, na.strings = 'NAN')
test <- read.table(test_path, header = FALSE, sep = "\t",
                   colClasses = column_description_vector, na.strings = 'NAN')
# The first column holds the target label; the remaining columns are features.
target <- 1
train_pool <- catboost.load_pool(data = train[, -target], label = train[, target])
test_pool <- catboost.load_pool(data = test[, -target], label = test[, target])
fit_params <- list(iterations = 100,
                   thread_count = 10,
                   loss_function = 'Logloss',
                   ignored_features = c(4, 9),
                   border_count = 32,
                   depth = 5,
                   learning_rate = 0.03,
                   l2_leaf_reg = 3.5,
                   train_dir = 'train_dir',
                   logging_level = 'Silent')
# Train on the training pool; the test pool is passed as an evaluation set.
model <- catboost.train(train_pool, test_pool, fit_params)
# Accuracy helper for Logloss: predicted probabilities above 0.5 are mapped
# to class 1, the rest to -1, matching the -1/1 labels of the adult data.
calc_accuracy <- function(prediction, expected) {
  labels <- ifelse(prediction > 0.5, 1, -1)
  accuracy <- sum(labels == expected) / length(labels)
  return(accuracy)
}
# Predicted probability of the positive class for each test example.
prediction <- catboost.predict(model, test_pool, prediction_type = 'Probability')
cat("Sample predictions: ", sample(prediction, 5), "\n")
## Sample predictions: 0.9200995 0.9032273 0.5809853 0.8153078 0.7570051
##
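# The confusion table below compares hard class predictions (0/1 for Logloss)
# with the true labels (-1/1 in this data set). A sketch of the calls that
# would reproduce it:
labels <- catboost.predict(model, test_pool, prediction_type = 'Class')
table(labels, test[, target])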
## labels  -1   1
##      0 421 114
##      1  79 386
##
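# The accuracy figure below follows from the helper defined above, applied to
# the predicted probabilities and the true test labels:
accuracy <- calc_accuracy(prediction, test[, target])
cat("\nAccuracy: ", accuracy, "\n")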
## Accuracy: 0.807
##
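# The importance table below can be obtained from the fitted model; a sketch
# using catboost.get_feature_importance on the training pool:
cat("\nFeature importances", "\n")
catboost.get_feature_importance(model, train_pool)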
## Feature importances
##           [,1]
## V2   4.8351601
## V3   0.6282719
## V4   0.3590358
## V5  12.0475178
## V6   0.0000000
## V7  16.7281365
## V8  14.8822650
## V9  34.3260767
## V10  0.2452127
## V11  0.0000000
## V12 11.2170518
## V13  0.2650176
## V14  4.4662540
## V15  0.0000000
##
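# The number of trees in the fitted model is available on the model object:
cat("\nTree count: ", model$tree_count, "\n")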
## Tree count: 100