library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(tidyverse)
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
## lift(): purrr, caret
library(rpart.plot)
## Loading required package: rpart
library(readr)
titanic <- read_csv("~/OneDrive/Dropbox/Cursos/fiabilidad/Material para la clase/arboles/titanic.csv")
## Parsed with column specification:
## cols(
##   PassengerId = col_integer(),
##   Survived = col_integer(),
##   Pclass = col_integer(),
##   Name = col_character(),
##   Sex = col_character(),
##   Age = col_double(),
##   SibSp = col_integer(),
##   Parch = col_integer(),
##   Ticket = col_character(),
##   Fare = col_double(),
##   Cabin = col_character(),
##   Embarked = col_character()
## )
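Before recoding anything, a quick structural look at what readr parsed, using dplyr's glimpse() (a sketch; output omitted):

# Compact overview: dimensions, column types, first few values
glimpse(titanic)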
# Recode the categorical variables as factors
titanic$Sex <- as.factor(titanic$Sex)
titanic$Cabin <- as.factor(titanic$Cabin)
titanic$Embarked <- as.factor(titanic$Embarked)
# Passenger class as an ordered factor, from lowest (3rd) to highest (1st)
titanic$Pclass <- ordered(titanic$Pclass,
                          levels = c("3", "2", "1"))
# The response must be a factor for caret to treat this as classification
titanic$Survived <- as.factor(titanic$Survived)
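A quick sanity check that the class variable really is ordered from worst to best (output omitted):

# Should return c("3", "2", "1"): third class is the lowest level
levels(titanic$Pclass)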
# Impute missing values (e.g. Age) with k-nearest neighbours; knnImpute
# also centers and scales the numeric columns as a side effect.
# PassengerId (column 1) is dropped, as it carries no information.
impute <- preProcess(titanic[,-1], method = c("knnImpute"))
titanic <- predict(impute, titanic[,-1])
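It is worth verifying what the imputation did before modelling: the numeric columns should now be complete, while factor columns such as Cabin may still contain NAs, which knnImpute does not touch (a sketch; output omitted):

# Count remaining missing values per column
colSums(is.na(titanic))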
# Resampling scheme: 10-fold cross-validation. Note that a repeats
# argument is only honoured when method = "repeatedcv", so it is
# dropped here; the output below confirms plain 10-fold CV was run.
control <- trainControl(method = "cv",
                        number = 10)
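If three repeats of 10-fold CV were actually intended, the method must be "repeatedcv". A minimal sketch (the name control_rcv is illustrative and the object is not used below):

# Hypothetical alternative: 3 x 10-fold repeated cross-validation
control_rcv <- trainControl(method = "repeatedcv",  # repeated k-fold CV
                            number = 10,            # 10 folds
                            repeats = 3)            # repeated 3 times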
rf_default <- train(Survived ~ Sex + Pclass + Age + SibSp + Fare,
                    data = titanic,
                    method = "rf",
                    trControl = control)
## Loading required package: randomForest
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
print(rf_default)
## Random Forest
##
## 891 samples
## 5 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 801, 803, 802, 801, 802, 802, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8327423 0.6346217
## 4 0.8305456 0.6360961
## 6 0.8226680 0.6195824
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
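To see which of the five predictors drive the forest, caret's varImp() can be applied to the fitted train object (output omitted):

# Relative importance of the predictors in the final forest
varImp(rf_default)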
# Predictions on the training data itself (resubstitution, so optimistic)
train_preds <- predict(rf_default,
                       newdata = titanic,
                       type = "raw")  # return class predictions
confusionMatrix(train_preds, titanic$Survived, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 523 73
## 1 26 269
##
## Accuracy : 0.8889
## 95% CI : (0.8664, 0.9088)
## No Information Rate : 0.6162
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7589
## Mcnemar's Test P-Value : 3.779e-06
##
## Sensitivity : 0.7865
## Specificity : 0.9526
## Pos Pred Value : 0.9119
## Neg Pred Value : 0.8775
## Prevalence : 0.3838
## Detection Rate : 0.3019
## Detection Prevalence : 0.3311
## Balanced Accuracy : 0.8696
##
## 'Positive' Class : 1
##
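The 0.8889 accuracy above is measured on the same rows the forest was trained on, so it is optimistic; the cross-validated estimate near 0.83 is the fairer yardstick. A minimal sketch to pull that figure out of the train object:

# Cross-validated accuracy of the selected mtry, for comparison with
# the resubstitution accuracy above
max(rf_default$results$Accuracy)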
seed <- 7
metric <- "Accuracy"
set.seed(seed)
# Search over mtry = 1..5, the number of predictors sampled at each split
# (the leading dot in .mtry is older caret grid syntax; mtry also works)
tunegrid <- expand.grid(.mtry = 1:5)
rf_default <- train(Survived ~ Sex + Pclass + Age + SibSp + Fare,
                    data = titanic,
                    method = "rf",
                    metric = metric,
                    tuneGrid = tunegrid,
                    trControl = control)
print(rf_default)
## Random Forest
##
## 891 samples
## 5 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 802, 801, 802, 802, 803, 802, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 1 0.7991216 0.5412911
## 2 0.8148281 0.5968956
## 3 0.8204469 0.6121958
## 4 0.8160402 0.6045974
## 5 0.8126319 0.5981923
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.
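The grid results are easier to read as a plot; caret's plot method for train objects draws the resampled accuracy against mtry (output omitted):

# Accuracy as a function of mtry over the tuning grid
plot(rf_default)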