library(catboost)
library(caret)
library(titanic)
set.seed(12345)
# Convert every column (including the numeric ones) to a factor so that all
# features are passed to CatBoost as categorical.
data <- as.data.frame(as.matrix(titanic_train), stringsAsFactors = TRUE)

# Impute missing Age values with the most frequent Age level.
age_levels <- levels(data$Age)
most_frequent_age <- which.max(table(data$Age))
data$Age[is.na(data$Age)] <- age_levels[most_frequent_age]

# Split into features and target, dropping identifier and free-text columns.
drop_columns <- c("PassengerId", "Survived", "Name", "Ticket", "Cabin")
x <- data[, !(names(data) %in% drop_columns)]
y <- data[, "Survived"]

# 5-fold cross-validation with class probabilities enabled.
fit_control <- trainControl(method = "cv",
                            number = 5,
                            classProbs = TRUE)

# Tuning grid: only the tree depth varies; the remaining parameters are held constant.
grid <- expand.grid(depth = c(4, 6, 8),
                    learning_rate = 0.1,
                    iterations = 100,
                    l2_leaf_reg = 0.1,
                    rsm = 0.95,
                    border_count = 64)

# Train CatBoost through caret's custom-model interface; make.names() turns
# the 0/1 labels into valid factor level names ('X0', 'X1').
model <- train(x, as.factor(make.names(y)),
               method = catboost.caret,
               logging_level = 'Silent', preProc = NULL,
               tuneGrid = grid, trControl = fit_control)
print(model)
## Catboost
##
## 891 samples
## 7 predictor
## 2 classes: 'X0', 'X1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 713, 713, 712, 713, 713
## Resampling results across tuning parameters:
##
## depth Accuracy Kappa
## 4 0.8057937 0.5725966
## 6 0.8103258 0.5835593
## 8 0.8080723 0.5778741
##
## Tuning parameter 'learning_rate' was held constant at a value of 0.1
##
## Tuning parameter 'rsm' was held constant at a value of 0.95
## Tuning
## parameter 'border_count' was held constant at a value of 64
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were depth = 6, learning_rate =
## 0.1, iterations = 100, l2_leaf_reg = 0.1, rsm = 0.95 and border_count = 64.
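The tuned object is a regular caret `train` fit, so the selected hyperparameters and the full resampling table can also be read off programmatically. A minimal sketch using caret's standard fields (no CatBoost-specific accessors assumed):

# Best hyperparameter combination chosen by cross-validation (depth = 6 above).
model$bestTune

# Accuracy and Kappa for every grid point, as summarised in the printout.
model$results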
# Feature importances from the fitted CatBoost model (unscaled, so they sum to 100).
importance <- varImp(model, scale = FALSE)
print(importance)
## custom variable importance
##
## Overall
## Sex 33.212
## Fare 16.739
## Pclass 16.294
## Age 11.997
## Parch 8.247
## SibSp 8.030
## Embarked 5.482
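Predictions go through caret's usual `predict()` interface. A minimal sketch, evaluated on the training features for brevity; class probabilities are available because the model was trained with `classProbs = TRUE`:

# Predicted classes ('X0'/'X1') on the training features; for a proper
# evaluation, apply the same preprocessing to titanic_test instead.
pred_class <- predict(model, newdata = x)

# Per-class probabilities (one column per factor level).
pred_prob <- predict(model, newdata = x, type = "prob")

# Agreement with the observed labels.
confusionMatrix(pred_class, as.factor(make.names(y)))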