1.Tratamento de Dados
knitr::opts_chunk$set(echo = TRUE)
library(titanic) # loads titanic_train data frame
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(tidyverse)
## -- Attaching packages ---------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## v purrr 0.3.3
## -- Conflicts ------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
library(rpart)
# 3 significant digits
options(digits = 3)
# clean the data - `titanic_train` is loaded with the titanic package
titanic_clean <- titanic_train %>%
mutate(Survived = factor(Survived),
Embarked = factor(Embarked),
Age = ifelse(is.na(Age), median(Age, na.rm = TRUE), Age), # NA age to median age
FamilySize = SibSp + Parch + 1) %>% # count family members
select(Survived, Sex, Pclass, Age, Fare, SibSp, Parch, FamilySize, Embarked)
dim(titanic_clean) #Exportação dos dados
## [1] 891 9
head(titanic_clean) #Exportação dos dados
## Survived Sex Pclass Age Fare SibSp Parch FamilySize Embarked
## 1 0 male 3 22 7.25 1 0 2 S
## 2 1 female 1 38 71.28 1 0 2 C
## 3 1 female 3 26 7.92 0 0 1 S
## 4 1 female 1 35 53.10 1 0 2 S
## 5 0 male 3 35 8.05 0 0 1 S
## 6 0 male 3 28 8.46 0 0 1 Q
set.seed(42)
test_index <- createDataPartition(titanic_clean$Survived, times = 1, p = 0.2, list = FALSE)
test_set <- titanic_clean[test_index, ]
train_set <- titanic_clean[-test_index, ]
dim(train_set) #Tamanho do treino
## [1] 712 9
dim(test_set) #Tamanho do teste
## [1] 179 9
mean(train_set$Survived == 1)# % de individuos que sobreviveu
## [1] 0.383
2. Previsão da linha de base
#The simplest prediction method is randomly guessing the outcome without using additional predictors. These methods will help us determine whether our machine learning algorithm performs better than chance
set.seed(3)
# guess the outcome com p padrão
n <- length(test_index)
y_hat <- sample(c("0", "1"), n, replace = TRUE) %>%
factor(levels = levels(test_set$Survived))
mean(y_hat == test_set$Survived)
## [1] 0.542
confusionMatrix(data = y_hat,reference = test_set$Survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 56 28
## 1 54 41
##
## Accuracy : 0.542
## 95% CI : (0.466, 0.616)
## No Information Rate : 0.615
## P-Value [Acc > NIR] : 0.98018
##
## Kappa : 0.097
##
## Mcnemar's Test P-Value : 0.00577
##
## Sensitivity : 0.509
## Specificity : 0.594
## Pos Pred Value : 0.667
## Neg Pred Value : 0.432
## Prevalence : 0.615
## Detection Rate : 0.313
## Detection Prevalence : 0.469
## Balanced Accuracy : 0.552
##
## 'Positive' Class : 0
##
3. Sobrevivencia dos Passageiros e Genero
mean(train_set$Survived[train_set$Sex == 'female' ] == 1 ) # % de mulheres que sobreviveram
## [1] 0.733
mean(train_set$Survived[train_set$Sex == 'male' ] == 1 ) # % de homens que sobreviveram
## [1] 0.193
sex_model <- ifelse(test_set$Sex == "female", 1, 0)%>%
factor(levels = levels(test_set$Survived)) # predict Survived=1 if female, 0 if male
mean(sex_model == test_set$Survived) # calculate accuracy
## [1] 0.81
confusionMatrix(data = sex_model,reference = test_set$Survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 96 20
## 1 14 49
##
## Accuracy : 0.81
## 95% CI : (0.745, 0.865)
## No Information Rate : 0.615
## P-Value [Acc > NIR] : 1.35e-08
##
## Kappa : 0.592
##
## Mcnemar's Test P-Value : 0.391
##
## Sensitivity : 0.873
## Specificity : 0.710
## Pos Pred Value : 0.828
## Neg Pred Value : 0.778
## Prevalence : 0.615
## Detection Rate : 0.536
## Detection Prevalence : 0.648
## Balanced Accuracy : 0.791
##
## 'Positive' Class : 0
##
train_set %>%
group_by(Pclass) %>%
summarize(Survived = mean(Survived == 1))
## # A tibble: 3 x 2
## Pclass Survived
## <int> <dbl>
## 1 1 0.622
## 2 2 0.459
## 3 3 0.244
P_model <- ifelse(test_set$Pclass == 1, 1, 0)%>%
factor(levels = levels(test_set$Survived)) # predict Survived=1 if female, 0 if male
mean(P_model == test_set$Survived)
## [1] 0.682
train_set %>%
group_by(Pclass,Sex) %>%
summarize(Survived = mean(Survived == 1))
## # A tibble: 6 x 3
## # Groups: Pclass [3]
## Pclass Sex Survived
## <int> <chr> <dbl>
## 1 1 female 0.962
## 2 1 male 0.363
## 3 2 female 0.914
## 4 2 male 0.159
## 5 3 female 0.487
## 6 3 male 0.140
PS_model<- ifelse((test_set$Sex == 'female' & test_set$Pclass == 1) | (test_set$Sex == 'female' & test_set$Pclass == 2),1,0)%>%
factor(levels = levels(test_set$Survived))
mean(PS_model==test_set$Survived)
## [1] 0.793
confusionMatrix(data = P_model,reference = test_set$Survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 98 45
## 1 12 24
##
## Accuracy : 0.682
## 95% CI : (0.608, 0.749)
## No Information Rate : 0.615
## P-Value [Acc > NIR] : 0.0375
##
## Kappa : 0.262
##
## Mcnemar's Test P-Value : 2.25e-05
##
## Sensitivity : 0.891
## Specificity : 0.348
## Pos Pred Value : 0.685
## Neg Pred Value : 0.667
## Prevalence : 0.615
## Detection Rate : 0.547
## Detection Prevalence : 0.799
## Balanced Accuracy : 0.619
##
## 'Positive' Class : 0
##
confusionMatrix(data = PS_model,reference = test_set$Survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 109 36
## 1 1 33
##
## Accuracy : 0.793
## 95% CI : (0.727, 0.85)
## No Information Rate : 0.615
## P-Value [Acc > NIR] : 2.28e-07
##
## Kappa : 0.518
##
## Mcnemar's Test P-Value : 2.28e-08
##
## Sensitivity : 0.991
## Specificity : 0.478
## Pos Pred Value : 0.752
## Neg Pred Value : 0.971
## Prevalence : 0.615
## Detection Rate : 0.609
## Detection Prevalence : 0.810
## Balanced Accuracy : 0.735
##
## 'Positive' Class : 0
##
F_meas(sex_model,test_set$Survived)
## [1] 0.85
F_meas(P_model,test_set$Survived)
## [1] 0.775
F_meas(PS_model,test_set$Survived)
## [1] 0.855
4. Previsão lda e qda Utilizando o “FARE”
set.seed(1)
train_qda <- train(Survived ~Fare, method = "lda", data = train_set)
y_hat <- predict(train_qda, test_set)
confusionMatrix(data = y_hat, reference = test_set$Survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 107 58
## 1 3 11
##
## Accuracy : 0.659
## 95% CI : (0.585, 0.728)
## No Information Rate : 0.615
## P-Value [Acc > NIR] : 0.124
##
## Kappa : 0.155
##
## Mcnemar's Test P-Value : 4.71e-12
##
## Sensitivity : 0.973
## Specificity : 0.159
## Pos Pred Value : 0.648
## Neg Pred Value : 0.786
## Prevalence : 0.615
## Detection Rate : 0.598
## Detection Prevalence : 0.922
## Balanced Accuracy : 0.566
##
## 'Positive' Class : 0
##
train_lda <- train(Survived ~Fare, method = "qda", data = train_set)
y_hat <- predict(train_lda, test_set)
confusionMatrix(data = y_hat, reference = test_set$Survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 107 57
## 1 3 12
##
## Accuracy : 0.665
## 95% CI : (0.591, 0.733)
## No Information Rate : 0.615
## P-Value [Acc > NIR] : 0.0951
##
## Kappa : 0.172
##
## Mcnemar's Test P-Value : 7.79e-12
##
## Sensitivity : 0.973
## Specificity : 0.174
## Pos Pred Value : 0.652
## Neg Pred Value : 0.800
## Prevalence : 0.615
## Detection Rate : 0.598
## Detection Prevalence : 0.916
## Balanced Accuracy : 0.573
##
## 'Positive' Class : 0
##
5. Regressão Logisitca
set.seed(1)
train_glm <- train(Survived ~ Age, method = "glm", data = train_set)
y_hat_glm <- predict(train_glm, test_set, type = "raw")
confusionMatrix(y_hat_glm, test_set$Survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 110 69
## 1 0 0
##
## Accuracy : 0.615
## 95% CI : (0.539, 0.686)
## No Information Rate : 0.615
## P-Value [Acc > NIR] : 0.533
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 2.7e-16
##
## Sensitivity : 1.000
## Specificity : 0.000
## Pos Pred Value : 0.615
## Neg Pred Value : NaN
## Prevalence : 0.615
## Detection Rate : 0.615
## Detection Prevalence : 1.000
## Balanced Accuracy : 0.500
##
## 'Positive' Class : 0
##
set.seed(1)
train_glm <- train(Survived ~ Sex + Pclass + Fare + Age, method = "glm", data = train_set)
y_hat_glm <- predict(train_glm, test_set, type = "raw")
confusionMatrix(y_hat_glm, test_set$Survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 99 21
## 1 11 48
##
## Accuracy : 0.821
## 95% CI : (0.757, 0.874)
## No Information Rate : 0.615
## P-Value [Acc > NIR] : 1.72e-09
##
## Kappa : 0.612
##
## Mcnemar's Test P-Value : 0.112
##
## Sensitivity : 0.900
## Specificity : 0.696
## Pos Pred Value : 0.825
## Neg Pred Value : 0.814
## Prevalence : 0.615
## Detection Rate : 0.553
## Detection Prevalence : 0.670
## Balanced Accuracy : 0.798
##
## 'Positive' Class : 0
##
set.seed(1)
train_glm <- train(Survived ~ ., method = "glm", data = train_set)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading
y_hat_glm <- predict(train_glm, test_set, type = "raw")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading
confusionMatrix(y_hat_glm, test_set$Survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 98 19
## 1 12 50
##
## Accuracy : 0.827
## 95% CI : (0.763, 0.879)
## No Information Rate : 0.615
## P-Value [Acc > NIR] : 5.83e-10
##
## Kappa : 0.627
##
## Mcnemar's Test P-Value : 0.281
##
## Sensitivity : 0.891
## Specificity : 0.725
## Pos Pred Value : 0.838
## Neg Pred Value : 0.806
## Prevalence : 0.615
## Detection Rate : 0.547
## Detection Prevalence : 0.654
## Balanced Accuracy : 0.808
##
## 'Positive' Class : 0
##
6. KNN
set.seed(8)
train.control <- trainControl(method = "cv", number = 10, p = .9)
train_knn <- train(Survived ~ ., method = "knn", tuneGrid = data.frame(k = seq(3, 51, 2)),
data = train_set,trControl = train.control)
y_hat_knn_ks <- predict(train_knn, test_set)
confusionMatrix(data=y_hat_knn_ks, test_set$Survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 99 36
## 1 11 33
##
## Accuracy : 0.737
## 95% CI : (0.667, 0.8)
## No Information Rate : 0.615
## P-Value [Acc > NIR] : 0.000361
##
## Kappa : 0.406
##
## Mcnemar's Test P-Value : 0.000464
##
## Sensitivity : 0.900
## Specificity : 0.478
## Pos Pred Value : 0.733
## Neg Pred Value : 0.750
## Prevalence : 0.615
## Detection Rate : 0.553
## Detection Prevalence : 0.754
## Balanced Accuracy : 0.689
##
## 'Positive' Class : 0
##
head(y_hat_knn_ks) #Resultados com Cross-Validation
## [1] 0 1 1 1 1 0
## Levels: 0 1
train_knn$bestTune
## k
## 11 23
train_knn$finalModel
## 23-nearest neighbor model
## Training set outcome distribution:
##
## 0 1
## 439 273
train_knn$results %>%
ggplot(aes(x = k, y = Accuracy)) +
geom_line() +
geom_point() +
geom_errorbar(aes(x = k,
ymin = Accuracy - AccuracySD,
ymax = Accuracy + AccuracySD))

ggplot(train_knn, highlight = TRUE)

7. Arvore de Decisção
set.seed(10)
# Decision Tree With Cross-Validation
train_rpart <- train(Survived ~ .,
method = "rpart",
tuneGrid = data.frame(cp = seq(0, 0.05, 0.002)),
data = train_set)
head(train_rpart)
## $method
## [1] "rpart"
##
## $modelInfo
## $modelInfo$label
## [1] "CART"
##
## $modelInfo$library
## [1] "rpart"
##
## $modelInfo$type
## [1] "Regression" "Classification"
##
## $modelInfo$parameters
## parameter class label
## 1 cp numeric Complexity Parameter
##
## $modelInfo$grid
## function(x, y, len = NULL, search = "grid"){
## dat <- if(is.data.frame(x)) x else as.data.frame(x, stringsAsFactors = TRUE)
## dat$.outcome <- y
## initialFit <- rpart::rpart(.outcome ~ .,
## data = dat,
## control = rpart::rpart.control(cp = 0))$cptable
## initialFit <- initialFit[order(-initialFit[,"CP"]), , drop = FALSE]
## if(search == "grid") {
## if(nrow(initialFit) < len) {
## tuneSeq <- data.frame(cp = seq(min(initialFit[, "CP"]),
## max(initialFit[, "CP"]),
## length = len))
## } else tuneSeq <- data.frame(cp = initialFit[1:len,"CP"])
## colnames(tuneSeq) <- "cp"
## } else {
## tuneSeq <- data.frame(cp = unique(sample(initialFit[, "CP"], size = len, replace = TRUE)))
## }
##
## tuneSeq
## }
##
## $modelInfo$loop
## function(grid) {
## grid <- grid[order(grid$cp, decreasing = FALSE),, drop = FALSE]
## loop <- grid[1,,drop = FALSE]
## submodels <- list(grid[-1,,drop = FALSE])
## list(loop = loop, submodels = submodels)
## }
##
## $modelInfo$fit
## function(x, y, wts, param, lev, last, classProbs, ...) {
## cpValue <- if(!last) param$cp else 0
## theDots <- list(...)
## if(any(names(theDots) == "control"))
## {
## theDots$control$cp <- cpValue
## theDots$control$xval <- 0
## ctl <- theDots$control
## theDots$control <- NULL
## } else ctl <- rpart::rpart.control(cp = cpValue, xval = 0)
##
## ## check to see if weights were passed in (and availible)
## if(!is.null(wts)) theDots$weights <- wts
##
## modelArgs <- c(list(formula = as.formula(".outcome ~ ."),
## data = if(is.data.frame(x)) x else as.data.frame(x, stringsAsFactors = TRUE),
## control = ctl),
## theDots)
## modelArgs$data$.outcome <- y
##
## out <- do.call(rpart::rpart, modelArgs)
##
## if(last) out <- rpart::prune.rpart(out, cp = param$cp)
## out
## }
## <bytecode: 0x0000000024a23a58>
##
## $modelInfo$predict
## function(modelFit, newdata, submodels = NULL) {
## if(!is.data.frame(newdata)) newdata <- as.data.frame(newdata, stringsAsFactors = TRUE)
##
## pType <- if(modelFit$problemType == "Classification") "class" else "vector"
## out <- predict(modelFit, newdata, type=pType)
##
## if(!is.null(submodels))
## {
## tmp <- vector(mode = "list", length = nrow(submodels) + 1)
## tmp[[1]] <- out
## for(j in seq(along = submodels$cp))
## {
## prunedFit <- rpart::prune.rpart(modelFit, cp = submodels$cp[j])
## tmp[[j+1]] <- predict(prunedFit, newdata, type=pType)
## }
## out <- tmp
## }
## out
## }
## <bytecode: 0x0000000025c0b9a0>
##
## $modelInfo$prob
## function(modelFit, newdata, submodels = NULL) {
## if(!is.data.frame(newdata)) newdata <- as.data.frame(newdata, stringsAsFactors = TRUE)
## out <- predict(modelFit, newdata, type = "prob")
##
## if(!is.null(submodels))
## {
## tmp <- vector(mode = "list", length = nrow(submodels) + 1)
## tmp[[1]] <- out
## for(j in seq(along = submodels$cp))
## {
## prunedFit <- rpart::prune.rpart(modelFit, cp = submodels$cp[j])
## tmpProb <- predict(prunedFit, newdata, type = "prob")
## tmp[[j+1]] <- as.data.frame(tmpProb[, modelFit$obsLevels, drop = FALSE], stringsAsFactors = TRUE)
## }
## out <- tmp
## }
## out
## }
##
## $modelInfo$predictors
## function(x, surrogate = TRUE, ...) {
## out <- as.character(x$frame$var)
## out <- out[!(out %in% c("<leaf>"))]
## if(surrogate)
## {
## splits <- x$splits
## splits <- splits[splits[,"adj"] > 0,]
## out <- c(out, rownames(splits))
## }
## unique(out)
## }
##
## $modelInfo$varImp
## function(object, surrogates = FALSE, competes = TRUE, ...) {
## if(nrow(object$splits)>0) {
## tmp <- rownames(object$splits)
## rownames(object$splits) <- 1:nrow(object$splits)
## splits <- data.frame(object$splits)
## splits$var <- tmp
## splits$type <- ""
##
## frame <- as.data.frame(object$frame, stringsAsFactors = TRUE)
## index <- 0
## for(i in 1:nrow(frame)) {
## if(frame$var[i] != "<leaf>") {
## index <- index + 1
## splits$type[index] <- "primary"
## if(frame$ncompete[i] > 0) {
## for(j in 1:frame$ncompete[i]) {
## index <- index + 1
## splits$type[index] <- "competing"
## }
## }
## if(frame$nsurrogate[i] > 0) {
## for(j in 1:frame$nsurrogate[i]) {
## index <- index + 1
## splits$type[index] <- "surrogate"
## }
## }
## }
## }
## splits$var <- factor(as.character(splits$var))
## if(!surrogates) splits <- subset(splits, type != "surrogate")
## if(!competes) splits <- subset(splits, type != "competing")
## out <- aggregate(splits$improve,
## list(Variable = splits$var),
## sum,
## na.rm = TRUE)
## } else {
## out <- data.frame(x = numeric(), Variable = character())
## }
## allVars <- colnames(attributes(object$terms)$factors)
## if(!all(allVars %in% out$Variable)) {
## missingVars <- allVars[!(allVars %in% out$Variable)]
## zeros <- data.frame(x = rep(0, length(missingVars)),
## Variable = missingVars)
## out <- rbind(out, zeros)
## }
## out2 <- data.frame(Overall = out$x)
## rownames(out2) <- out$Variable
## out2
## }
##
## $modelInfo$levels
## function(x) x$obsLevels
##
## $modelInfo$trim
## function(x) {
## x$call <- list(na.action = (x$call)$na.action)
## x$x <- NULL
## x$y <- NULL
## x$where <- NULL
## x
## }
##
## $modelInfo$tags
## [1] "Tree-Based Model" "Implicit Feature Selection"
## [3] "Handle Missing Predictor Data" "Accepts Case Weights"
##
## $modelInfo$sort
## function(x) x[order(x[,1], decreasing = TRUE),]
##
##
## $modelType
## [1] "Classification"
##
## $results
## cp Accuracy Kappa AccuracySD KappaSD
## 1 0.000 0.775 0.518 0.0251 0.0541
## 2 0.002 0.780 0.527 0.0221 0.0483
## 3 0.004 0.787 0.539 0.0220 0.0464
## 4 0.006 0.789 0.543 0.0220 0.0429
## 5 0.008 0.788 0.540 0.0213 0.0433
## 6 0.010 0.791 0.544 0.0236 0.0490
## 7 0.012 0.792 0.544 0.0244 0.0512
## 8 0.014 0.792 0.544 0.0246 0.0518
## 9 0.016 0.790 0.537 0.0239 0.0501
## 10 0.018 0.793 0.541 0.0229 0.0496
## 11 0.020 0.793 0.539 0.0251 0.0554
## 12 0.022 0.792 0.536 0.0281 0.0598
## 13 0.024 0.789 0.531 0.0293 0.0613
## 14 0.026 0.788 0.529 0.0299 0.0612
## 15 0.028 0.786 0.522 0.0284 0.0586
## 16 0.030 0.784 0.521 0.0266 0.0564
## 17 0.032 0.784 0.520 0.0265 0.0557
## 18 0.034 0.783 0.518 0.0266 0.0559
## 19 0.036 0.781 0.514 0.0272 0.0568
## 20 0.038 0.781 0.514 0.0272 0.0568
## 21 0.040 0.778 0.510 0.0257 0.0545
## 22 0.042 0.778 0.510 0.0257 0.0545
## 23 0.044 0.776 0.507 0.0256 0.0542
## 24 0.046 0.775 0.506 0.0254 0.0546
## 25 0.048 0.774 0.502 0.0244 0.0524
## 26 0.050 0.774 0.503 0.0244 0.0543
##
## $pred
## NULL
##
## $bestTune
## cp
## 11 0.02
ggplot(train_rpart)

confusionMatrix(predict(train_rpart, test_set), test_set$Survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 102 19
## 1 8 50
##
## Accuracy : 0.849
## 95% CI : (0.788, 0.898)
## No Information Rate : 0.615
## P-Value [Acc > NIR] : 5.17e-12
##
## Kappa : 0.672
##
## Mcnemar's Test P-Value : 0.0543
##
## Sensitivity : 0.927
## Specificity : 0.725
## Pos Pred Value : 0.843
## Neg Pred Value : 0.862
## Prevalence : 0.615
## Detection Rate : 0.570
## Detection Prevalence : 0.676
## Balanced Accuracy : 0.826
##
## 'Positive' Class : 0
##
plot(train_rpart$finalModel , margin = 0.1)
text(train_rpart$finalModel , cex = 0.75)

8. Floresta Aleatoria
set.seed(14)
fit <- with(train_set,
train(Survived ~ ., method = "rf",
data = train_set,
tuneGrid = data.frame(mtry = seq(1, 7, 1)),
ntree=100))
confusionMatrix(predict(fit, test_set), test_set$Survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 107 18
## 1 3 51
##
## Accuracy : 0.883
## 95% CI : (0.826, 0.926)
## No Information Rate : 0.615
## P-Value [Acc > NIR] : 1.17e-15
##
## Kappa : 0.742
##
## Mcnemar's Test P-Value : 0.00225
##
## Sensitivity : 0.973
## Specificity : 0.739
## Pos Pred Value : 0.856
## Neg Pred Value : 0.944
## Prevalence : 0.615
## Detection Rate : 0.598
## Detection Prevalence : 0.698
## Balanced Accuracy : 0.856
##
## 'Positive' Class : 0
##
ggplot(fit)

imp <- varImp(fit)
imp
## rf variable importance
##
## Overall
## Sexmale 100.00
## Fare 61.20
## Age 42.48
## Pclass 29.70
## FamilySize 18.28
## SibSp 11.44
## Parch 8.51
## EmbarkedS 3.83
## EmbarkedC 3.55
## EmbarkedQ 0.00