Carregar o banco de dados
# Read the Titanic passenger data from a CSV file in the working directory.
dataset <- read.csv("titanic.csv")
Dividindo o conjunto de dados em treinamento (80%) e teste (20%)
library(caTools)

# sample.split() draws a random partition, so the RNG must be seeded *before*
# it is called; otherwise train/test membership changes on every run and
# nothing downstream is reproducible.
set.seed(123)

# Logical vector with one entry per row: TRUE marks the ~80% of rows assigned
# to the training set. The split is done on Survived so that both subsets keep
# a similar proportion of each outcome.
split <- sample.split(dataset$Survived, SplitRatio = 0.8)

# Rows flagged TRUE go to training; the remaining rows are held out for testing.
train <- dataset[split, ]
test <- dataset[!split, ]
Verificar a codificação das variáveis
# Print a compact summary of the training set (column names, storage types and
# the first few values) to check how each variable is encoded.
str(train)
## 'data.frame': 713 obs. of 12 variables:
## $ PassengerId: int 1 3 4 5 6 7 8 9 10 11 ...
## $ Survived : int 0 1 1 0 0 0 0 1 1 1 ...
## $ Pclass : int 3 3 1 3 3 1 3 3 2 3 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" "Allen, Mr. William Henry" ...
## $ Sex : chr "male" "female" "female" "male" ...
## $ Age : num 22 26 35 35 NA 54 2 27 14 4 ...
## $ SibSp : int 1 0 1 0 0 0 3 0 1 1 ...
## $ Parch : int 0 0 0 0 0 0 1 2 0 1 ...
## $ Ticket : chr "A/5 21171" "STON/O2. 3101282" "113803" "373450" ...
## $ Fare : num 7.25 7.92 53.1 8.05 8.46 ...
## $ Cabin : chr "" "" "C123" "" ...
## $ Embarked : chr "S" "S" "S" "S" ...
Instalando e carregando pacotes
# Install each dependency only when it is not already available, so that
# re-running the script does not reinstall packages (and hit the network)
# every time.
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
if (!requireNamespace("Amelia", quietly = TRUE)) {
  install.packages("Amelia")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
if (!requireNamespace("pROC", quietly = TRUE)) {
  install.packages("pROC")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.3, built: 2024-11-07)
## ## Copyright (C) 2005-2025 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
Definindo a semente
# Fix the random-number generator state so that the random steps that follow
# (e.g. the cross-validation fold assignment) give the same result on each run.
set.seed(123)
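É preciso recodificar a variável resposta como fator, com níveis que sejam nomes válidos em R ("M" e "S"), quando se usa twoClassSummary com classProbs = TRUE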
# twoClassSummary with classProbs = TRUE needs a *factor* outcome whose levels
# are valid R names. Survived is read from the CSV as integer 0/1, and assigning
# levels() to an integer vector only attaches an attribute -- it does not create
# a factor. Build the factor explicitly instead: 0 -> "M", 1 -> "S"
# (presumably M = did not survive, S = survived -- confirm the intended labels).
# "M" is kept as the first level, i.e. the class caret reports Sens for.
train$Survived <- factor(
  train$Survived,
  levels = c(0, 1),
  labels = c("M", "S")
)
test$Survived <- factor(
  test$Survived,
  levels = c(0, 1),
  labels = c("M", "S")
)
Vamos utilizar validação cruzada com 10 folds
# Resampling setup: 10-fold cross-validation, scored with twoClassSummary
# (ROC, Sens, Spec). classProbs = TRUE is needed so class probabilities are
# available for the ROC computation.
ctrl <- trainControl(
  method = "cv",
  number = 10,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)

# Fit a CART classifier, tuning the maximum tree depth ("rpart2") over up to
# 20 candidate values and selecting the depth with the best cross-validated ROC.
# Note that `train` is both caret's fitting function and the name of the
# training data frame; R resolves the call to the function.
# NOTE(review): `Survived ~ .` uses every other column as a predictor, which
# would include identifier-like text columns if they are present in `train` --
# confirm that this is intended.
dtFit <- train(
  Survived ~ .,
  data = train,
  method = "rpart2",
  metric = "ROC",
  tuneLength = 20,
  trControl = ctrl
)
## note: only 8 possible values of the max tree depth from the initial fit.
## Truncating the grid to 8 .
# Auto-print the fitted object to show the resampling results for each
# candidate maxdepth and the value that was selected.
dtFit
## CART
##
## 566 samples
## 11 predictor
## 2 classes: 'M', 'S'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 509, 510, 509, 510, 509, 510, ...
## Resampling results across tuning parameters:
##
## maxdepth ROC Sens Spec
## 1 0.7468176 0.8403743 0.6532609
## 2 0.7694645 0.8674688 0.6239130
## 3 0.7876737 0.9129234 0.5775362
## 4 0.7932923 0.9038324 0.5900362
## 6 0.7919289 0.8735294 0.6327899
## 7 0.7959060 0.8408200 0.6458333
## 12 0.8106004 0.8377005 0.6545290
## 13 0.8106004 0.8377005 0.6545290
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was maxdepth = 12.
# Plot the tuning results stored in dtFit (resampled performance for each
# candidate tuning value).
plot(dtFit)

Desenho da árvore
# Install rpart.plot only when it is missing, so that re-running the script
# does not reinstall it every time.
if (!requireNamespace("rpart.plot", quietly = TRUE)) {
  install.packages("rpart.plot")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(rpart.plot)
## Loading required package: rpart

# Draw the final tree kept by caret (dtFit$finalModel).
#   type = 1        label every node, not only the leaves
#   extra = 4       show the per-class probabilities in each node
#   cex = 0.7       shrink the text so the tree fits on the page
#   box.palette     colour the node boxes with the red-yellow-green palette
rpart.plot(
  dtFit$finalModel,
  type = 1,
  extra = 4,
  cex = 0.7,
  box.palette = "RdYlGn"
)
