#install.packages("rpart")
#install.packages("rpart.plot")
# Deliberately no `rm(list = ls())` here: it wipes the caller's workspace when
# the script is sourced, yet leaves attached packages and options untouched, so
# it does not give a clean state. Run the script in a fresh R session instead.
library(data.table)
library(rpart.plot)
## Loading required package: rpart
library(rpart)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Example 1: classification tree on the built-in iris data ----

# Copy the built-in dataset into the workspace. Referring to datasets::iris
# explicitly means a stale `iris` object left in the session cannot shadow it.
iris <- datasets::iris

# Inspect the structure and the per-column summary statistics.
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##

# Fit a classification tree that predicts Species from the four measurement
# columns, using all 150 rows, then draw it.
arbol_1 <- rpart(Species ~ ., data = iris, method = "class")
rpart.plot(arbol_1, main = "Árbol de Clasificación: Flores")
# Hold-out evaluation of the iris tree ----

# Fix the RNG state so the train/test split below is reproducible.
set.seed(12345)

# Split the sample into two parts: row indices for an 80% training set
# (sampled within each Species level, per caret's documentation for factor
# outcomes); the remaining rows form the test set.
# `list = FALSE` returns an index matrix usable directly for row subsetting.
div <- createDataPartition(iris$Species, times = 1, p = 0.8, list = FALSE)
train <- iris[div, ]
test <- iris[-div, ]

# Fit the tree on the training rows only and plot it.
arbol_2 <- rpart(Species ~ ., data = train, method = "class")
rpart.plot(arbol_2)

# Use predict() to compute the predicted class for each held-out row.
prediccion_1 <- predict(arbol_2, newdata = test, type = "class")

# Confusion matrix: rows are the true species, columns the predicted species.
matriz <- table(test$Species, prediccion_1)
matriz
## prediccion_1
## setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 0
## virginica 0 2 8

# Share of test rows on the diagonal, i.e. correctly classified
# (overall accuracy, despite the variable name).
precision <- sum(diag(matriz)) / sum(matriz)
precision
## [1] 0.9333333
# Extra: Example 2 ----
# Load the ptitanic dataset and inspect its structure.
data("ptitanic")
str(ptitanic)
## 'data.frame': 1309 obs. of 6 variables:
## $ pclass : Factor w/ 3 levels "1st","2nd","3rd": 1 1 1 1 1 1 1 1 1 1 ...
## $ survived: Factor w/ 2 levels "died","survived": 2 2 1 1 1 2 2 1 2 1 ...
## $ sex : Factor w/ 2 levels "female","male": 1 2 1 2 1 2 1 2 1 2 ...
## $ age : 'labelled' num 29 0.917 2 30 25 ...
## ..- attr(*, "units")= chr "Year"
## ..- attr(*, "label")= chr "Age"
## $ sibsp : 'labelled' int 0 1 1 1 1 0 1 0 2 0 ...
## ..- attr(*, "label")= chr "Number of Siblings/Spouses Aboard"
## $ parch : 'labelled' int 0 2 2 2 2 0 0 0 0 0 ...
## ..- attr(*, "label")= chr "Number of Parents/Children Aboard"

# str() above shows these three columns carry a 'labelled' class; coerce them
# to plain numeric / integer vectors before modelling.
ptitanic$age <- as.numeric(ptitanic$age)
ptitanic$sibsp <- as.integer(ptitanic$sibsp)
ptitanic$parch <- as.integer(ptitanic$parch)

# Row-wise proportions (margin 1): share of each sex that died / survived.
round(prop.table(table(ptitanic$sex, ptitanic$survived), 1), 2)
##
## died survived
## female 0.27 0.73
## male 0.81 0.19

# NOTE: the seed is set here, before the first rpart() call, and the split
# further down draws from the same RNG stream. Keep the statements in this
# order to reproduce the recorded outputs.
set.seed(123)

# Tree on the full data using only class, sex and age. The very small
# complexity parameter (cp) lets the tree grow with almost no pruning.
# `method = "class"` is spelled out for consistency with the iris example; it
# is the method rpart selects for a factor response anyway.
arbol_3 <- rpart(survived ~ pclass + sex + age, data = ptitanic,
                 method = "class", cp = 0.00001)
rpart.plot(arbol_3)

# Split the sample into two parts: 80% training rows, the rest for testing.
div <- createDataPartition(ptitanic$survived, times = 1, p = 0.8, list = FALSE)
train <- ptitanic[div, ]
test <- ptitanic[-div, ]

# Tree on the training rows using every available predictor.
arbol_4 <- rpart(survived ~ ., data = train, method = "class", cp = 0.00001)
rpart.plot(arbol_4)

# Predicted class for each held-out passenger, cross-tabulated against the
# true outcome (rows = actual, columns = predicted).
prediccion_2 <- predict(arbol_4, newdata = test, type = "class")
matriz_2 <- table(test$survived, prediccion_2)
matriz_2
## prediccion_2
## died survived
## died 137 24
## survived 26 74

# Accuracy: share of test rows on the diagonal of the confusion matrix.
sum(diag(matriz_2)) / sum(matriz_2)
## [1] 0.8084291

# Class counts in the test set, for comparison with the accuracy above
# (always predicting "died" would score 161 / 261, about 0.62).
table(test$survived)
##
## died survived
## 161 100