library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(tidyverse)
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
## lift(): purrr, caret
library(rpart.plot)
## Loading required package: rpart
library(readr)
titanic <- read_csv("~/OneDrive/Dropbox/Cursos/fiabilidad/Material para la clase/arboles/titanic.csv")
## Parsed with column specification:
## cols(
##   PassengerId = col_integer(),
##   Survived = col_integer(),
##   Pclass = col_integer(),
##   Name = col_character(),
##   Sex = col_character(),
##   Age = col_double(),
##   SibSp = col_integer(),
##   Parch = col_integer(),
##   Ticket = col_character(),
##   Fare = col_double(),
##   Cabin = col_character(),
##   Embarked = col_character()
## )
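Before recoding anything, a quick structural look at what readr parsed, using dplyr's glimpse() (a sketch; output omitted):

# Compact overview: dimensions, column types, first few values
glimpse(titanic)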
# Recode the categorical variables as factors
titanic$Sex <- as.factor(titanic$Sex)
titanic$Cabin <- as.factor(titanic$Cabin)
titanic$Embarked <- as.factor(titanic$Embarked)
# Passenger class as an ordered factor, from lowest (3rd) to highest (1st)
titanic$Pclass <- ordered(titanic$Pclass,
                          levels = c("3", "2", "1"))
# The response must be a factor for caret to treat this as classification
titanic$Survived <- as.factor(titanic$Survived)
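A quick sanity check that the class variable really is ordered from worst to best (output omitted):

# Should return c("3", "2", "1"): third class is the lowest level
levels(titanic$Pclass)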
# Impute missing values (e.g. Age) with k-nearest neighbours; knnImpute
# also centers and scales the numeric columns as a side effect.
# PassengerId (column 1) is dropped, as it carries no information.
impute <- preProcess(titanic[,-1], method = c("knnImpute"))
titanic <- predict(impute, titanic[,-1])
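It is worth verifying what the imputation did before modelling: the numeric columns should now be complete, while factor columns such as Cabin may still contain NAs, which knnImpute does not touch (a sketch; output omitted):

# Count remaining missing values per column
colSums(is.na(titanic))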
# Resampling scheme: 10-fold cross-validation. Note that a repeats
# argument is only honoured when method = "repeatedcv", so it is
# dropped here; the output below confirms plain 10-fold CV was run.
control <- trainControl(method = "cv",
                        number = 10)
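If three repeats of 10-fold CV were actually intended, the method must be "repeatedcv". A minimal sketch (the name control_rcv is illustrative and the object is not used below):

# Hypothetical alternative: 3 x 10-fold repeated cross-validation
control_rcv <- trainControl(method = "repeatedcv",  # repeated k-fold CV
                            number = 10,            # 10 folds
                            repeats = 3)            # repeated 3 times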
rf_default <- train(Survived ~ Sex + Pclass + Age + SibSp + Fare,
                    data = titanic,
                    method = "rf",
                    trControl = control)
## Loading required package: randomForest
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
print(rf_default)
## Random Forest
##
## 891 samples
## 5 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 801, 803, 802, 801, 802, 802, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8327423 0.6346217
## 4 0.8305456 0.6360961
## 6 0.8226680 0.6195824
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
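To see which of the five predictors drive the forest, caret's varImp() can be applied to the fitted train object (output omitted):

# Relative importance of the predictors in the final forest
varImp(rf_default)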
# Predictions on the training data itself (resubstitution, so optimistic)
train_preds <- predict(rf_default,
                       newdata = titanic,
                       type = "raw")  # return class predictions
confusionMatrix(train_preds, titanic$Survived, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 523 73
## 1 26 269
##
## Accuracy : 0.8889
## 95% CI : (0.8664, 0.9088)
## No Information Rate : 0.6162
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7589
## Mcnemar's Test P-Value : 3.779e-06
##
## Sensitivity : 0.7865
## Specificity : 0.9526
## Pos Pred Value : 0.9119
## Neg Pred Value : 0.8775
## Prevalence : 0.3838
## Detection Rate : 0.3019
## Detection Prevalence : 0.3311
## Balanced Accuracy : 0.8696
##
## 'Positive' Class : 1
##
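The 0.8889 accuracy above is measured on the same rows the forest was trained on, so it is optimistic; the cross-validated estimate near 0.83 is the fairer yardstick. A minimal sketch to pull that figure out of the train object:

# Cross-validated accuracy of the selected mtry, for comparison with
# the resubstitution accuracy above
max(rf_default$results$Accuracy)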
seed <- 7
metric <- "Accuracy"
set.seed(seed)
# Search over mtry = 1..5, the number of predictors sampled at each split
# (the leading dot in .mtry is older caret grid syntax; mtry also works)
tunegrid <- expand.grid(.mtry = 1:5)
rf_default <- train(Survived ~ Sex + Pclass + Age + SibSp + Fare,
                    data = titanic,
                    method = "rf",
                    metric = metric,
                    tuneGrid = tunegrid,
                    trControl = control)
print(rf_default)
## Random Forest
##
## 891 samples
## 5 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 802, 801, 802, 802, 803, 802, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 1 0.7991216 0.5412911
## 2 0.8148281 0.5968956
## 3 0.8204469 0.6121958
## 4 0.8160402 0.6045974
## 5 0.8126319 0.5981923
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.
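The grid results are easier to read as a plot; caret's plot method for train objects draws the resampled accuracy against mtry (output omitted):

# Accuracy as a function of mtry over the tuning grid
plot(rf_default)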