Carregar o banco de dados

# Read the Titanic passenger table from the working directory into a
# data frame; every later step derives from this object.
dataset <- read.csv(file = "titanic.csv")

Dividindo o conjunto de dados entre teste e treinamento

library(caTools)
# Seed the RNG *before* splitting: sample.split() draws at random, so
# without a seed at this point the train/test partition would differ on
# every run and none of the results below could be reproduced.
set.seed(123)

# `split` is a logical vector with one entry per row of `dataset`:
# TRUE marks the ~80% of rows assigned to training, FALSE the remainder.
# Splitting on `Survived` keeps the outcome proportions similar in both
# parts (see the caTools::sample.split documentation).
split <- sample.split(dataset$Survived, SplitRatio = 0.8)
train <- subset(dataset, split == TRUE)
test <- subset(dataset, split == FALSE)

Verificar a codificação das variáveis

# Show the column names and storage types of the training set, to spot
# which columns still need converting to factors.
str(train)
## 'data.frame':    713 obs. of  12 variables:
##  $ PassengerId: int  1 3 4 5 6 7 8 9 10 11 ...
##  $ Survived   : int  0 1 1 0 0 0 0 1 1 1 ...
##  $ Pclass     : int  3 3 1 3 3 1 3 3 2 3 ...
##  $ Name       : chr  "Braund, Mr. Owen Harris" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" "Allen, Mr. William Henry" ...
##  $ Sex        : chr  "male" "female" "female" "male" ...
##  $ Age        : num  22 26 35 35 NA 54 2 27 14 4 ...
##  $ SibSp      : int  1 0 1 0 0 0 3 0 1 1 ...
##  $ Parch      : int  0 0 0 0 0 0 1 2 0 1 ...
##  $ Ticket     : chr  "A/5 21171" "STON/O2. 3101282" "113803" "373450" ...
##  $ Fare       : num  7.25 7.92 53.1 8.05 8.46 ...
##  $ Cabin      : chr  "" "" "C123" "" ...
##  $ Embarked   : chr  "S" "S" "S" "S" ...

Transformando as variáveis que precisam estar como categóricas

# Convert the categorical columns to factors in both partitions.
#
# The level set is computed from train and test together so that both data
# frames end up with identical levels in identical order. Converting each
# partition on its own (as.factor per data frame) can give them different
# levels when a rare value lands in only one partition, which later breaks
# prediction on the test set.
categorical_cols <- c("Survived", "Pclass", "Sex", "Embarked")
for (column_name in categorical_cols) {
  # sort(unique(...)) reproduces the level order as.factor() would choose
  # and drops NA, so NA never becomes a level.
  shared_levels <- sort(unique(c(train[[column_name]], test[[column_name]])))
  train[[column_name]] <- factor(train[[column_name]], levels = shared_levels)
  test[[column_name]] <- factor(test[[column_name]], levels = shared_levels)
}

Instalando (se necessário) e carregando os pacotes

# Install the modelling packages only when they are missing. Calling
# install.packages() unconditionally re-downloads every package on each
# run and fails when the machine is offline.
for (pkg in c("caret", "Amelia", "pROC")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
# Output recorded on a machine where none of the three was installed yet:
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(Amelia)
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.3, built: 2024-11-07)
## ## Copyright (C) 2005-2025 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

Definindo a semente

# Fix the random number generator state so that the random steps that
# follow (e.g. the cross-validation fold assignment) repeat across runs.
set.seed(seed = 123)

Contando os valores ausentes (NA) no conjunto de treinamento e removendo as linhas que os contêm

# Count the NA cells in the training set (cells, not rows); the value is
# auto-printed when run at top level.
sum(is.na(train))
## [1] 147
# Keep only the rows with no NA in any column. Only `train` is filtered
# here; `test` is left as it is.
train <- na.omit(train)

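Ao usar twoClassSummary (com classProbs = TRUE), os níveis da variável resposta precisam ser nomes válidos em R; por isso renomeamos os níveis "0" e "1" de Survived para "M" e "S"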

# Rename the outcome levels, in their existing order, to "M" and "S" in
# both partitions. With classProbs = TRUE caret uses the level names as
# column names for the class probabilities, so they must be valid R names
# (the original "0"/"1" are not). The first level ("M") is the class that
# twoClassSummary reports sensitivity for.
levels(train$Survived) <- levels(test$Survived) <- c("M", "S")

Vamos ajustar uma árvore de decisão usando validação cruzada com 10 partições (10-fold)

# Resampling setup: 10-fold cross-validation. twoClassSummary reports ROC
# AUC, sensitivity and specificity, and needs class probabilities, hence
# classProbs = TRUE.
ctrl <- trainControl(
  method = "cv",
  number = 10,
  summaryFunction = twoClassSummary,
  classProbs = TRUE
)

# Fit a CART tree, tuning the maximum depth ("rpart2") over up to 20
# candidate values and picking the depth with the largest cross-validated
# ROC AUC.
#
# The predictors are listed explicitly. `Survived ~ .` would also feed in
# PassengerId, Name, Ticket and Cabin: identifier-like columns whose values
# are (almost) unique per passenger. They expand into hundreds of dummy
# variables, invite overfitting, and make predict() fail on new data that
# contains names or tickets never seen during training.
dtFit <- train(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
  data = train,
  method = "rpart2",
  tuneLength = 20,
  trControl = ctrl,
  metric = "ROC"
)
## note: only 8 possible values of the max tree depth from the initial fit.
##  Truncating the grid to 8 .
# Print the resampling summary: one row of ROC / Sens / Spec per candidate
# maxdepth, followed by the depth that was selected.
dtFit
## CART 
## 
## 566 samples
##  11 predictor
##   2 classes: 'M', 'S' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 509, 510, 509, 510, 509, 510, ... 
## Resampling results across tuning parameters:
## 
##   maxdepth  ROC        Sens       Spec     
##    1        0.7468176  0.8403743  0.6532609
##    2        0.7694645  0.8674688  0.6239130
##    3        0.7876737  0.9129234  0.5775362
##    4        0.7932923  0.9038324  0.5900362
##    6        0.7919289  0.8735294  0.6327899
##    7        0.7959060  0.8408200  0.6458333
##   12        0.8106004  0.8377005  0.6545290
##   13        0.8106004  0.8377005  0.6545290
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was maxdepth = 12.
# Plot the tuning results stored in the fitted caret object.
plot(dtFit)

Desenho da árvore

# Install rpart.plot only when it is missing, so that re-running the
# script does not download the package again every time.
if (!requireNamespace("rpart.plot", quietly = TRUE)) {
  install.packages("rpart.plot")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(rpart.plot)
## Loading required package: rpart
# Draw the tree that caret refitted with the selected maxdepth
# (dtFit$finalModel is the underlying rpart object).
rpart.plot(
  dtFit$finalModel,
  type = 1,               # label every node, not only the leaves
  extra = 4,              # show the per-class probabilities in each node
  box.palette = "RdYlGn", # colour the node boxes with the RdYlGn palette
  cex = 0.7               # shrink the text so the tree fits the device
)