TITANIC PROJECT

1.Tratamento de Dados

knitr::opts_chunk$set(echo = TRUE)
library(titanic)    # loads titanic_train data frame
library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

library(tidyverse)

## -- Attaching packages ---------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --

## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## v purrr   0.3.3

## -- Conflicts ------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x purrr::lift()   masks caret::lift()

library(rpart)
# 3 significant digits
options(digits = 3)
# clean the data - `titanic_train` is loaded with the titanic package
titanic_clean <- titanic_train %>%
    mutate(Survived = factor(Survived),
           Embarked = factor(Embarked),
           Age = ifelse(is.na(Age), median(Age, na.rm = TRUE), Age), # NA age to median age
           FamilySize = SibSp + Parch + 1) %>%    # count family members
    select(Survived,  Sex, Pclass, Age, Fare, SibSp, Parch, FamilySize, Embarked)
dim(titanic_clean) #Exportação dos dados

## [1] 891   9

head(titanic_clean) #Exportação dos dados

##   Survived    Sex Pclass Age  Fare SibSp Parch FamilySize Embarked
## 1        0   male      3  22  7.25     1     0          2        S
## 2        1 female      1  38 71.28     1     0          2        C
## 3        1 female      3  26  7.92     0     0          1        S
## 4        1 female      1  35 53.10     1     0          2        S
## 5        0   male      3  35  8.05     0     0          1        S
## 6        0   male      3  28  8.46     0     0          1        Q

set.seed(42)
test_index <- createDataPartition(titanic_clean$Survived, times = 1, p = 0.2, list = FALSE)
test_set <- titanic_clean[test_index, ]
train_set <- titanic_clean[-test_index, ]
dim(train_set) #Tamanho do treino

## [1] 712   9

dim(test_set) #Tamanho do teste

## [1] 179   9

mean(train_set$Survived == 1)# % de individuos que sobreviveu

## [1] 0.383

2. Previsão da linha de base

#The simplest prediction method is randomly guessing the outcome without using additional predictors. These methods will help us determine whether our machine learning algorithm performs better than chance
set.seed(3)
# guess the outcome com p padrão
n <- length(test_index)
y_hat <- sample(c("0", "1"), n, replace = TRUE) %>% 
  factor(levels = levels(test_set$Survived))
mean(y_hat == test_set$Survived)

## [1] 0.542

confusionMatrix(data = y_hat,reference = test_set$Survived)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 56 28
##          1 54 41
##                                         
##                Accuracy : 0.542         
##                  95% CI : (0.466, 0.616)
##     No Information Rate : 0.615         
##     P-Value [Acc > NIR] : 0.98018       
##                                         
##                   Kappa : 0.097         
##                                         
##  Mcnemar's Test P-Value : 0.00577       
##                                         
##             Sensitivity : 0.509         
##             Specificity : 0.594         
##          Pos Pred Value : 0.667         
##          Neg Pred Value : 0.432         
##              Prevalence : 0.615         
##          Detection Rate : 0.313         
##    Detection Prevalence : 0.469         
##       Balanced Accuracy : 0.552         
##                                         
##        'Positive' Class : 0             
##

3. Sobrevivencia dos Passageiros e Genero

mean(train_set$Survived[train_set$Sex == 'female' ] == 1 ) # % de mulheres que sobreviveram

## [1] 0.733

mean(train_set$Survived[train_set$Sex == 'male' ] == 1  ) # % de homens que sobreviveram

## [1] 0.193

sex_model <- ifelse(test_set$Sex == "female", 1, 0)%>% 
  factor(levels = levels(test_set$Survived))    # predict Survived=1 if female, 0 if male
mean(sex_model == test_set$Survived)    # calculate accuracy

## [1] 0.81

confusionMatrix(data = sex_model,reference = test_set$Survived)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 96 20
##          1 14 49
##                                         
##                Accuracy : 0.81          
##                  95% CI : (0.745, 0.865)
##     No Information Rate : 0.615         
##     P-Value [Acc > NIR] : 1.35e-08      
##                                         
##                   Kappa : 0.592         
##                                         
##  Mcnemar's Test P-Value : 0.391         
##                                         
##             Sensitivity : 0.873         
##             Specificity : 0.710         
##          Pos Pred Value : 0.828         
##          Neg Pred Value : 0.778         
##              Prevalence : 0.615         
##          Detection Rate : 0.536         
##    Detection Prevalence : 0.648         
##       Balanced Accuracy : 0.791         
##                                         
##        'Positive' Class : 0             
##

train_set %>%
    group_by(Pclass) %>%
    summarize(Survived = mean(Survived == 1))

## # A tibble: 3 x 2
##   Pclass Survived
##    <int>    <dbl>
## 1      1    0.622
## 2      2    0.459
## 3      3    0.244

P_model <- ifelse(test_set$Pclass == 1, 1, 0)%>% 
  factor(levels = levels(test_set$Survived))    # predict Survived=1 if female, 0 if male
mean(P_model == test_set$Survived)

## [1] 0.682

train_set %>%
    group_by(Pclass,Sex) %>%
    summarize(Survived = mean(Survived == 1))

## # A tibble: 6 x 3
## # Groups:   Pclass [3]
##   Pclass Sex    Survived
##    <int> <chr>     <dbl>
## 1      1 female    0.962
## 2      1 male      0.363
## 3      2 female    0.914
## 4      2 male      0.159
## 5      3 female    0.487
## 6      3 male      0.140

PS_model<- ifelse((test_set$Sex == 'female' & test_set$Pclass == 1) | (test_set$Sex == 'female' & test_set$Pclass == 2),1,0)%>% 
  factor(levels = levels(test_set$Survived))

mean(PS_model==test_set$Survived)

## [1] 0.793

confusionMatrix(data = P_model,reference = test_set$Survived)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 98 45
##          1 12 24
##                                         
##                Accuracy : 0.682         
##                  95% CI : (0.608, 0.749)
##     No Information Rate : 0.615         
##     P-Value [Acc > NIR] : 0.0375        
##                                         
##                   Kappa : 0.262         
##                                         
##  Mcnemar's Test P-Value : 2.25e-05      
##                                         
##             Sensitivity : 0.891         
##             Specificity : 0.348         
##          Pos Pred Value : 0.685         
##          Neg Pred Value : 0.667         
##              Prevalence : 0.615         
##          Detection Rate : 0.547         
##    Detection Prevalence : 0.799         
##       Balanced Accuracy : 0.619         
##                                         
##        'Positive' Class : 0             
##

confusionMatrix(data = PS_model,reference = test_set$Survived)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 109  36
##          1   1  33
##                                        
##                Accuracy : 0.793        
##                  95% CI : (0.727, 0.85)
##     No Information Rate : 0.615        
##     P-Value [Acc > NIR] : 2.28e-07     
##                                        
##                   Kappa : 0.518        
##                                        
##  Mcnemar's Test P-Value : 2.28e-08     
##                                        
##             Sensitivity : 0.991        
##             Specificity : 0.478        
##          Pos Pred Value : 0.752        
##          Neg Pred Value : 0.971        
##              Prevalence : 0.615        
##          Detection Rate : 0.609        
##    Detection Prevalence : 0.810        
##       Balanced Accuracy : 0.735        
##                                        
##        'Positive' Class : 0            
##

F_meas(sex_model,test_set$Survived)

## [1] 0.85

F_meas(P_model,test_set$Survived)

## [1] 0.775

F_meas(PS_model,test_set$Survived)

## [1] 0.855

4. Previsão lda e qda Utilizando o “FARE”

set.seed(1)
train_qda <- train(Survived ~Fare, method = "lda", data = train_set)
y_hat <- predict(train_qda, test_set)
confusionMatrix(data = y_hat, reference = test_set$Survived)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 107  58
##          1   3  11
##                                         
##                Accuracy : 0.659         
##                  95% CI : (0.585, 0.728)
##     No Information Rate : 0.615         
##     P-Value [Acc > NIR] : 0.124         
##                                         
##                   Kappa : 0.155         
##                                         
##  Mcnemar's Test P-Value : 4.71e-12      
##                                         
##             Sensitivity : 0.973         
##             Specificity : 0.159         
##          Pos Pred Value : 0.648         
##          Neg Pred Value : 0.786         
##              Prevalence : 0.615         
##          Detection Rate : 0.598         
##    Detection Prevalence : 0.922         
##       Balanced Accuracy : 0.566         
##                                         
##        'Positive' Class : 0             
##

train_lda <- train(Survived ~Fare, method = "qda", data = train_set)
y_hat <- predict(train_lda, test_set)
confusionMatrix(data = y_hat, reference = test_set$Survived)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 107  57
##          1   3  12
##                                         
##                Accuracy : 0.665         
##                  95% CI : (0.591, 0.733)
##     No Information Rate : 0.615         
##     P-Value [Acc > NIR] : 0.0951        
##                                         
##                   Kappa : 0.172         
##                                         
##  Mcnemar's Test P-Value : 7.79e-12      
##                                         
##             Sensitivity : 0.973         
##             Specificity : 0.174         
##          Pos Pred Value : 0.652         
##          Neg Pred Value : 0.800         
##              Prevalence : 0.615         
##          Detection Rate : 0.598         
##    Detection Prevalence : 0.916         
##       Balanced Accuracy : 0.573         
##                                         
##        'Positive' Class : 0             
##

5. Regressão Logisitca

set.seed(1)
train_glm <- train(Survived ~ Age, method = "glm", data = train_set)
y_hat_glm <- predict(train_glm, test_set, type = "raw")
confusionMatrix(y_hat_glm, test_set$Survived)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 110  69
##          1   0   0
##                                         
##                Accuracy : 0.615         
##                  95% CI : (0.539, 0.686)
##     No Information Rate : 0.615         
##     P-Value [Acc > NIR] : 0.533         
##                                         
##                   Kappa : 0             
##                                         
##  Mcnemar's Test P-Value : 2.7e-16       
##                                         
##             Sensitivity : 1.000         
##             Specificity : 0.000         
##          Pos Pred Value : 0.615         
##          Neg Pred Value :   NaN         
##              Prevalence : 0.615         
##          Detection Rate : 0.615         
##    Detection Prevalence : 1.000         
##       Balanced Accuracy : 0.500         
##                                         
##        'Positive' Class : 0             
##

set.seed(1)
train_glm <- train(Survived ~ Sex + Pclass + Fare + Age, method = "glm", data = train_set)
y_hat_glm <- predict(train_glm, test_set, type = "raw")
confusionMatrix(y_hat_glm, test_set$Survived)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 99 21
##          1 11 48
##                                         
##                Accuracy : 0.821         
##                  95% CI : (0.757, 0.874)
##     No Information Rate : 0.615         
##     P-Value [Acc > NIR] : 1.72e-09      
##                                         
##                   Kappa : 0.612         
##                                         
##  Mcnemar's Test P-Value : 0.112         
##                                         
##             Sensitivity : 0.900         
##             Specificity : 0.696         
##          Pos Pred Value : 0.825         
##          Neg Pred Value : 0.814         
##              Prevalence : 0.615         
##          Detection Rate : 0.553         
##    Detection Prevalence : 0.670         
##       Balanced Accuracy : 0.798         
##                                         
##        'Positive' Class : 0             
##

set.seed(1)
train_glm <- train(Survived ~ ., method = "glm", data = train_set)

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading

y_hat_glm <- predict(train_glm, test_set, type = "raw")

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading

confusionMatrix(y_hat_glm, test_set$Survived)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 98 19
##          1 12 50
##                                         
##                Accuracy : 0.827         
##                  95% CI : (0.763, 0.879)
##     No Information Rate : 0.615         
##     P-Value [Acc > NIR] : 5.83e-10      
##                                         
##                   Kappa : 0.627         
##                                         
##  Mcnemar's Test P-Value : 0.281         
##                                         
##             Sensitivity : 0.891         
##             Specificity : 0.725         
##          Pos Pred Value : 0.838         
##          Neg Pred Value : 0.806         
##              Prevalence : 0.615         
##          Detection Rate : 0.547         
##    Detection Prevalence : 0.654         
##       Balanced Accuracy : 0.808         
##                                         
##        'Positive' Class : 0             
##

6. KNN

set.seed(8)
train.control <- trainControl(method = "cv", number = 10, p = .9)

train_knn <- train(Survived ~ ., method = "knn", tuneGrid = data.frame(k = seq(3, 51, 2)),
                   data = train_set,trControl = train.control)
y_hat_knn_ks <- predict(train_knn, test_set)
confusionMatrix(data=y_hat_knn_ks, test_set$Survived)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 99 36
##          1 11 33
##                                       
##                Accuracy : 0.737       
##                  95% CI : (0.667, 0.8)
##     No Information Rate : 0.615       
##     P-Value [Acc > NIR] : 0.000361    
##                                       
##                   Kappa : 0.406       
##                                       
##  Mcnemar's Test P-Value : 0.000464    
##                                       
##             Sensitivity : 0.900       
##             Specificity : 0.478       
##          Pos Pred Value : 0.733       
##          Neg Pred Value : 0.750       
##              Prevalence : 0.615       
##          Detection Rate : 0.553       
##    Detection Prevalence : 0.754       
##       Balanced Accuracy : 0.689       
##                                       
##        'Positive' Class : 0           
##

head(y_hat_knn_ks) #Resultados com Cross-Validation

## [1] 0 1 1 1 1 0
## Levels: 0 1

train_knn$bestTune

##     k
## 11 23

train_knn$finalModel

## 23-nearest neighbor model
## Training set outcome distribution:
## 
##   0   1 
## 439 273

train_knn$results %>% 
  ggplot(aes(x = k, y = Accuracy)) +
  geom_line() +
  geom_point() +
  geom_errorbar(aes(x = k, 
                    ymin = Accuracy - AccuracySD,
                    ymax = Accuracy + AccuracySD))

ggplot(train_knn, highlight = TRUE)

7. Arvore de Decisção

set.seed(10)
# Decision Tree With Cross-Validation
train_rpart <- train(Survived ~ .,
                     method = "rpart",
                     tuneGrid = data.frame(cp = seq(0, 0.05, 0.002)),
                     data = train_set)
head(train_rpart)

## $method
## [1] "rpart"
## 
## $modelInfo
## $modelInfo$label
## [1] "CART"
## 
## $modelInfo$library
## [1] "rpart"
## 
## $modelInfo$type
## [1] "Regression"     "Classification"
## 
## $modelInfo$parameters
##   parameter   class                label
## 1        cp numeric Complexity Parameter
## 
## $modelInfo$grid
## function(x, y, len = NULL, search = "grid"){
##                     dat <- if(is.data.frame(x)) x else as.data.frame(x, stringsAsFactors = TRUE)
##                     dat$.outcome <- y
##                     initialFit <- rpart::rpart(.outcome ~ .,
##                                                data = dat,
##                                                control = rpart::rpart.control(cp = 0))$cptable
##                     initialFit <- initialFit[order(-initialFit[,"CP"]), , drop = FALSE]
##                     if(search == "grid") {
##                       if(nrow(initialFit) < len) {
##                         tuneSeq <- data.frame(cp = seq(min(initialFit[, "CP"]),
##                                                        max(initialFit[, "CP"]),
##                                                        length = len))
##                       } else tuneSeq <-  data.frame(cp = initialFit[1:len,"CP"])
##                       colnames(tuneSeq) <- "cp"
##                     } else {
##                       tuneSeq <- data.frame(cp = unique(sample(initialFit[, "CP"], size = len, replace = TRUE)))
##                     }
## 
##                     tuneSeq
##                   }
## 
## $modelInfo$loop
## function(grid) {
##                     grid <- grid[order(grid$cp, decreasing = FALSE),, drop = FALSE]
##                     loop <- grid[1,,drop = FALSE]
##                     submodels <- list(grid[-1,,drop = FALSE])
##                     list(loop = loop, submodels = submodels)
##                   }
## 
## $modelInfo$fit
## function(x, y, wts, param, lev, last, classProbs, ...) {
##                     cpValue <- if(!last) param$cp else 0
##                     theDots <- list(...)
##                     if(any(names(theDots) == "control"))
##                     {
##                       theDots$control$cp <- cpValue
##                       theDots$control$xval <- 0
##                       ctl <- theDots$control
##                       theDots$control <- NULL
##                     } else ctl <- rpart::rpart.control(cp = cpValue, xval = 0)
## 
##                     ## check to see if weights were passed in (and availible)
##                     if(!is.null(wts)) theDots$weights <- wts
## 
##                     modelArgs <- c(list(formula = as.formula(".outcome ~ ."),
##                                         data = if(is.data.frame(x)) x else as.data.frame(x, stringsAsFactors = TRUE),
##                                         control = ctl),
##                                    theDots)
##                     modelArgs$data$.outcome <- y
## 
##                     out <- do.call(rpart::rpart, modelArgs)
## 
##                     if(last) out <- rpart::prune.rpart(out, cp = param$cp)
##                     out
##                   }
## <bytecode: 0x0000000024a23a58>
## 
## $modelInfo$predict
## function(modelFit, newdata, submodels = NULL) {
##                     if(!is.data.frame(newdata)) newdata <- as.data.frame(newdata, stringsAsFactors = TRUE)
## 
##                     pType <- if(modelFit$problemType == "Classification") "class" else "vector"
##                     out  <- predict(modelFit, newdata, type=pType)
## 
##                     if(!is.null(submodels))
##                     {
##                       tmp <- vector(mode = "list", length = nrow(submodels) + 1)
##                       tmp[[1]] <- out
##                       for(j in seq(along = submodels$cp))
##                       {
##                         prunedFit <- rpart::prune.rpart(modelFit, cp = submodels$cp[j])
##                         tmp[[j+1]]  <- predict(prunedFit, newdata, type=pType)
##                       }
##                       out <- tmp
##                     }
##                     out
##                   }
## <bytecode: 0x0000000025c0b9a0>
## 
## $modelInfo$prob
## function(modelFit, newdata, submodels = NULL) {
##                     if(!is.data.frame(newdata)) newdata <- as.data.frame(newdata, stringsAsFactors = TRUE)
##                     out <- predict(modelFit, newdata, type = "prob")
## 
##                     if(!is.null(submodels))
##                     {
##                       tmp <- vector(mode = "list", length = nrow(submodels) + 1)
##                       tmp[[1]] <- out
##                       for(j in seq(along = submodels$cp))
##                       {
##                         prunedFit <- rpart::prune.rpart(modelFit, cp = submodels$cp[j])
##                         tmpProb <- predict(prunedFit, newdata, type = "prob")
##                         tmp[[j+1]] <- as.data.frame(tmpProb[, modelFit$obsLevels, drop = FALSE], stringsAsFactors = TRUE)
##                       }
##                       out <- tmp
##                     }
##                     out
##                   }
## 
## $modelInfo$predictors
## function(x, surrogate = TRUE, ...)  {
##                     out <- as.character(x$frame$var)
##                     out <- out[!(out %in% c("<leaf>"))]
##                     if(surrogate)
##                     {
##                       splits <- x$splits
##                       splits <- splits[splits[,"adj"] > 0,]
##                       out <- c(out, rownames(splits))
##                     }
##                     unique(out)
##                   }
## 
## $modelInfo$varImp
## function(object, surrogates = FALSE, competes = TRUE, ...) {
##                     if(nrow(object$splits)>0) {
##                       tmp <- rownames(object$splits)
##                       rownames(object$splits) <- 1:nrow(object$splits)
##                       splits <- data.frame(object$splits)
##                       splits$var <- tmp
##                       splits$type <- ""
## 
##                       frame <- as.data.frame(object$frame, stringsAsFactors = TRUE)
##                       index <- 0
##                       for(i in 1:nrow(frame)) {
##                         if(frame$var[i] != "<leaf>") {
##                           index <- index + 1
##                           splits$type[index] <- "primary"
##                           if(frame$ncompete[i] > 0) {
##                             for(j in 1:frame$ncompete[i]) {
##                               index <- index + 1
##                               splits$type[index] <- "competing"
##                             }
##                           }
##                           if(frame$nsurrogate[i] > 0) {
##                             for(j in 1:frame$nsurrogate[i]) {
##                               index <- index + 1
##                               splits$type[index] <- "surrogate"
##                             }
##                           }
##                         }
##                       }
##                       splits$var <- factor(as.character(splits$var))
##                       if(!surrogates) splits <- subset(splits, type != "surrogate")
##                       if(!competes) splits <- subset(splits, type != "competing")
##                       out <- aggregate(splits$improve,
##                                        list(Variable = splits$var),
##                                        sum,
##                                        na.rm = TRUE)
##                     } else {
##            out <- data.frame(x = numeric(), Variable = character())
##          }
##                     allVars <- colnames(attributes(object$terms)$factors)
##                     if(!all(allVars %in% out$Variable)) {
##                       missingVars <- allVars[!(allVars %in% out$Variable)]
##                       zeros <- data.frame(x = rep(0, length(missingVars)),
##                                           Variable = missingVars)
##                       out <- rbind(out, zeros)
##                     }
##                     out2 <- data.frame(Overall = out$x)
##                     rownames(out2) <- out$Variable
##                     out2
##                   }
## 
## $modelInfo$levels
## function(x) x$obsLevels
## 
## $modelInfo$trim
## function(x) {
##                     x$call <- list(na.action = (x$call)$na.action)
##                     x$x <- NULL
##                     x$y <- NULL
##                     x$where <- NULL
##                     x
##                   }
## 
## $modelInfo$tags
## [1] "Tree-Based Model"              "Implicit Feature Selection"   
## [3] "Handle Missing Predictor Data" "Accepts Case Weights"         
## 
## $modelInfo$sort
## function(x) x[order(x[,1], decreasing = TRUE),]
## 
## 
## $modelType
## [1] "Classification"
## 
## $results
##       cp Accuracy Kappa AccuracySD KappaSD
## 1  0.000    0.775 0.518     0.0251  0.0541
## 2  0.002    0.780 0.527     0.0221  0.0483
## 3  0.004    0.787 0.539     0.0220  0.0464
## 4  0.006    0.789 0.543     0.0220  0.0429
## 5  0.008    0.788 0.540     0.0213  0.0433
## 6  0.010    0.791 0.544     0.0236  0.0490
## 7  0.012    0.792 0.544     0.0244  0.0512
## 8  0.014    0.792 0.544     0.0246  0.0518
## 9  0.016    0.790 0.537     0.0239  0.0501
## 10 0.018    0.793 0.541     0.0229  0.0496
## 11 0.020    0.793 0.539     0.0251  0.0554
## 12 0.022    0.792 0.536     0.0281  0.0598
## 13 0.024    0.789 0.531     0.0293  0.0613
## 14 0.026    0.788 0.529     0.0299  0.0612
## 15 0.028    0.786 0.522     0.0284  0.0586
## 16 0.030    0.784 0.521     0.0266  0.0564
## 17 0.032    0.784 0.520     0.0265  0.0557
## 18 0.034    0.783 0.518     0.0266  0.0559
## 19 0.036    0.781 0.514     0.0272  0.0568
## 20 0.038    0.781 0.514     0.0272  0.0568
## 21 0.040    0.778 0.510     0.0257  0.0545
## 22 0.042    0.778 0.510     0.0257  0.0545
## 23 0.044    0.776 0.507     0.0256  0.0542
## 24 0.046    0.775 0.506     0.0254  0.0546
## 25 0.048    0.774 0.502     0.0244  0.0524
## 26 0.050    0.774 0.503     0.0244  0.0543
## 
## $pred
## NULL
## 
## $bestTune
##      cp
## 11 0.02

ggplot(train_rpart)

confusionMatrix(predict(train_rpart, test_set), test_set$Survived)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 102  19
##          1   8  50
##                                         
##                Accuracy : 0.849         
##                  95% CI : (0.788, 0.898)
##     No Information Rate : 0.615         
##     P-Value [Acc > NIR] : 5.17e-12      
##                                         
##                   Kappa : 0.672         
##                                         
##  Mcnemar's Test P-Value : 0.0543        
##                                         
##             Sensitivity : 0.927         
##             Specificity : 0.725         
##          Pos Pred Value : 0.843         
##          Neg Pred Value : 0.862         
##              Prevalence : 0.615         
##          Detection Rate : 0.570         
##    Detection Prevalence : 0.676         
##       Balanced Accuracy : 0.826         
##                                         
##        'Positive' Class : 0             
##

plot(train_rpart$finalModel , margin = 0.1)
text(train_rpart$finalModel , cex = 0.75)

8. Floresta Aleatoria

set.seed(14)
fit <- with(train_set, 
              train(Survived ~ ., method = "rf", 
                    data = train_set,
                    tuneGrid = data.frame(mtry = seq(1, 7, 1)),
                    ntree=100))
  confusionMatrix(predict(fit, test_set), test_set$Survived)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 107  18
##          1   3  51
##                                         
##                Accuracy : 0.883         
##                  95% CI : (0.826, 0.926)
##     No Information Rate : 0.615         
##     P-Value [Acc > NIR] : 1.17e-15      
##                                         
##                   Kappa : 0.742         
##                                         
##  Mcnemar's Test P-Value : 0.00225       
##                                         
##             Sensitivity : 0.973         
##             Specificity : 0.739         
##          Pos Pred Value : 0.856         
##          Neg Pred Value : 0.944         
##              Prevalence : 0.615         
##          Detection Rate : 0.598         
##    Detection Prevalence : 0.698         
##       Balanced Accuracy : 0.856         
##                                         
##        'Positive' Class : 0             
##

  ggplot(fit)

  imp <- varImp(fit)
  imp

## rf variable importance
## 
##            Overall
## Sexmale     100.00
## Fare         61.20
## Age          42.48
## Pclass       29.70
## FamilySize   18.28
## SibSp        11.44
## Parch         8.51
## EmbarkedS     3.83
## EmbarkedC     3.55
## EmbarkedQ     0.00

TITANIC PROJECT

Luís

06/05/2020

1.Tratamento de Dados

2. Previsão da linha de base

3. Sobrevivencia dos Passageiros e Genero

4. Previsão lda e qda Utilizando o “FARE”

5. Regressão Logisitca

6. KNN

7. Arvore de Decisção

8. Floresta Aleatoria