library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(tidyverse)
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
## lift(): purrr, caret
library(rpart.plot)
## Loading required package: rpart
library(readr)
Clasificación Binaria
titanic <- read_csv("~/OneDrive/Dropbox/Cursos/fiabilidad/Material para la clase/arboles/titanic.csv")
## Parsed with column specification:
## cols(
## PassengerId = col_integer(),
## Survived = col_integer(),
## Pclass = col_integer(),
## Name = col_character(),
## Sex = col_character(),
## Age = col_double(),
## SibSp = col_integer(),
## Parch = col_integer(),
## Ticket = col_character(),
## Fare = col_double(),
## Cabin = col_character(),
## Embarked = col_character()
## )
titanic$Sex <- as.factor(titanic$Sex)
titanic$Cabin <- as.factor(titanic$Cabin)
titanic$Embarked <- as.factor(titanic$Embarked)
titanic$Pclass <- ordered(titanic$Pclass,
levels=c("3","2","1"))
summary(titanic)
## PassengerId Survived Pclass Name Sex
## Min. : 1.0 Min. :0.0000 3:491 Length:891 female:314
## 1st Qu.:223.5 1st Qu.:0.0000 2:184 Class :character male :577
## Median :446.0 Median :0.0000 1:216 Mode :character
## Mean :446.0 Mean :0.3838
## 3rd Qu.:668.5 3rd Qu.:1.0000
## Max. :891.0 Max. :1.0000
##
## Age SibSp Parch Ticket
## Min. : 0.42 Min. :0.000 Min. :0.0000 Length:891
## 1st Qu.:20.12 1st Qu.:0.000 1st Qu.:0.0000 Class :character
## Median :28.00 Median :0.000 Median :0.0000 Mode :character
## Mean :29.70 Mean :0.523 Mean :0.3816
## 3rd Qu.:38.00 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :80.00 Max. :8.000 Max. :6.0000
## NA's :177
## Fare Cabin Embarked
## Min. : 0.00 B96 B98 : 4 C :168
## 1st Qu.: 7.91 C23 C25 C27: 4 Q : 77
## Median : 14.45 G6 : 4 S :644
## Mean : 32.20 C22 C26 : 3 NA's: 2
## 3rd Qu.: 31.00 D : 3
## Max. :512.33 (Other) :186
## NA's :687
impute <- preProcess(titanic[,c(6:8,10)],method=c("knnImpute"))
titanic_imp <- predict(impute, titanic[,c(6:8,10)])
titanic_train <- cbind(titanic[,-c(6:8,10)], titanic_imp)
gender_tree <- rpart(Survived~Sex, data=titanic)
rpart.plot(gender_tree)

gender_class_tree <- rpart(Survived~Sex+Pclass, data=titanic)
rpart.plot(gender_class_tree)

complex_tree <- rpart(Survived ~ Sex + Pclass + Age + SibSp + Fare + Embarked,
cp = 0.001, # Set complexity parameter*
data = titanic_train)
rpart.plot(complex_tree)

limited_complexity_tree <- rpart(Survived ~ Sex + Pclass + Age + SibSp +Fare+Embarked,
cp = 0.001, # Set complexity parameter
maxdepth = 5, # Set maximum tree depth
minbucket = 5, # Set min number of obs in leaf nodes
method = "class", # Return classifications instead of probs
data = titanic)
rpart.plot(limited_complexity_tree)

train_preds <- predict(limited_complexity_tree,
newdata=titanic,
type="class") # Return class predictions
confusionMatrix(train_preds, titanic_train$Survived,positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 495 93
## 1 54 249
##
## Accuracy : 0.835
## 95% CI : (0.809, 0.8588)
## No Information Rate : 0.6162
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6435
## Mcnemar's Test P-Value : 0.001723
##
## Sensitivity : 0.7281
## Specificity : 0.9016
## Pos Pred Value : 0.8218
## Neg Pred Value : 0.8418
## Prevalence : 0.3838
## Detection Rate : 0.2795
## Detection Prevalence : 0.3401
## Balanced Accuracy : 0.8149
##
## 'Positive' Class : 1
##
Clasificación Multinomial
data(iris)
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
set.seed(123)
split <- createDataPartition(y=iris$Species, p=0.75, list=FALSE)
train <- iris[split,]
test <- iris[-split,]
tree.control <- trainControl(method = "cv",
number = 5)
tree.fit <- train(Species ~ . ,
data=train,
method = "rpart",
trControl = tree.control)
probsTest <- predict(tree.fit, test, type = "prob")
pred <- probsTest
pred$setosa <- ifelse(probsTest$setosa>0.5,1,0)
pred$versicolor <- ifelse(probsTest$versicolor>0.5,1,0)
pred$virginica <- ifelse(probsTest$virginica>0.5,1,0)
pred_vector<-vector()
for(i in 1:nrow(test)){
if(sum(pred[i,]==c(1,0,0)) == 3){
pred_vector[i]<-"setosa"
} else if(sum(pred[i,]==c(0,1,0))==3){
pred_vector[i] <- "versicolor"
} else {
pred_vector[i] <- "virginica"
}
}
table(pred_vector)
## pred_vector
## setosa versicolor virginica
## 12 11 13
table(test$Species)
##
## setosa versicolor virginica
## 12 12 12
table(pred_vector,test$Species,dnn=c("Prediction","True Value"))
## True Value
## Prediction setosa versicolor virginica
## setosa 12 0 0
## versicolor 0 10 1
## virginica 0 2 11
rpart.plot(tree.fit$finalModel)
