# Modelo de clasificación utilizando
# árboles y reglas
# Entendiendo los árboles de decisión:
# los árboles de decisión son clasificadores poderosos,
# los cuales utilizan una estructura de árbol para
# modelar las relaciones entre las características y
# los resultados potenciales.
# This example uses the iris data set, which contains
# 150 observations of five variables. One of them is a
# factor called Species that holds the outcome; the data
# is used to build a tree model that tries to predict
# the species.
data(iris)
# First six rows of the data.
head(iris, n = 6L)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Last six rows of the data.
tail(iris, n = 6L)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 145 6.7 3.3 5.7 2.5 virginica
## 146 6.7 3.0 5.2 2.3 virginica
## 147 6.3 2.5 5.0 1.9 virginica
## 148 6.5 3.0 5.2 2.0 virginica
## 149 6.2 3.4 5.4 2.3 virginica
## 150 5.9 3.0 5.1 1.8 virginica
# Make the train/test split reproducible.
# The results recorded in the "##" output lines of this script were produced
# with the sampling algorithm R used before 3.6.0 (sample.kind = "Rounding").
# Since R 3.6.0 the default is "Rejection", which returns different indices
# for the same seed, so the old sampler is selected explicitly here.
# RNGkind() warns that "Rounding" is non-uniform; that is expected here and
# therefore suppressed. Note that this setting stays active for the rest of
# the R session.
if (getRversion() >= "3.6.0") {
  suppressWarnings(RNGkind(sample.kind = "Rounding"))
}
set.seed(1234)
# Draw a random subset of row indices that will be used to train the model;
# the training set has 100 of the observations.
train_sample <- sample(nrow(iris), 100)
str(train_sample)
## int [1:100] 18 93 91 92 126 149 2 34 95 73 ...
# Split the data into training and test sets: the sampled rows are used for
# training, all remaining rows for testing.
iris_train <- iris[train_sample, ]
iris_test <- iris[-train_sample, ]
# Load the C50 package.
library(C50)
# Fit the model. Column 5 is dropped from the training predictors
# because it holds the outcome the model is meant to predict; that
# outcome (Species) is passed separately as the class vector.
iris_model <- C5.0(x = iris_train[-5], y = iris_train$Species)
# Print the fitted model.
iris_model
##
## Call:
## C5.0.default(x = iris_train[-5], y = iris_train$Species)
##
## Classification Tree
## Number of samples: 100
## Number of predictors: 4
##
## Tree size: 3
##
## Non-standard options: attempt to group attributes
# Show the details of the model.
summary(iris_model)
##
## Call:
## C5.0.default(x = iris_train[-5], y = iris_train$Species)
##
##
## C5.0 [Release 2.07 GPL Edition] Sun Jan 03 17:08:57 2016
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 100 cases (5 attributes) from undefined.data
##
## Decision tree:
##
## Petal.Length <= 1.9: setosa (38)
## Petal.Length > 1.9:
## :...Petal.Width <= 1.7: versicolor (30/2)
## Petal.Width > 1.7: virginica (32/1)
##
##
## Evaluation on training data (100 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 3 3( 3.0%) <<
##
##
## (a) (b) (c) <-classified as
## ---- ---- ----
## 38 (a): class setosa
## 28 1 (b): class versicolor
## 2 31 (c): class virginica
##
##
## Attribute usage:
##
## 100.00% Petal.Length
## 62.00% Petal.Width
##
##
## Time: 0.0 secs
# Use the model to make predictions for the test set.
iris_pred <- predict(iris_model, newdata = iris_test)
# Build a cross table of actual versus predicted species to analyse
# the results; the gmodels package is loaded for this.
library(gmodels)
CrossTable(
  x = iris_test$Species,
  y = iris_pred,
  prop.r = FALSE,
  prop.c = FALSE,
  prop.chisq = FALSE,
  dnn = c("Especie Actual", "Prediccion")
)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 50
##
##
## | Prediccion
## Especie Actual | setosa | versicolor | virginica | Row Total |
## ---------------|------------|------------|------------|------------|
## setosa | 12 | 0 | 0 | 12 |
## | 0.240 | 0.000 | 0.000 | |
## ---------------|------------|------------|------------|------------|
## versicolor | 0 | 21 | 0 | 21 |
## | 0.000 | 0.420 | 0.000 | |
## ---------------|------------|------------|------------|------------|
## virginica | 0 | 3 | 14 | 17 |
## | 0.000 | 0.060 | 0.280 | |
## ---------------|------------|------------|------------|------------|
## Column Total | 12 | 24 | 14 | 50 |
## ---------------|------------|------------|------------|------------|
##
##
# Los resultados en la diagonal principal
# muestran los aciertos del modelo; solamente
# tres de 17 para virginica resultaron errados.
# Use boosting to improve the model: same predictors and outcome as
# before, now with 10 boosting trials.
iris_model_boost10 <- C5.0(
  x = iris_train[-5],
  y = iris_train$Species,
  trials = 10
)
# Print the boosted model.
iris_model_boost10
##
## Call:
## C5.0.default(x = iris_train[-5], y = iris_train$Species, trials = 10)
##
## Classification Tree
## Number of samples: 100
## Number of predictors: 4
##
## Number of boosting iterations: 10
## Average tree size: 3.9
##
## Non-standard options: attempt to group attributes
# Show the details of the boosted model.
summary(iris_model_boost10)
##
## Call:
## C5.0.default(x = iris_train[-5], y = iris_train$Species, trials = 10)
##
##
## C5.0 [Release 2.07 GPL Edition] Sun Jan 03 17:08:57 2016
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 100 cases (5 attributes) from undefined.data
##
## ----- Trial 0: -----
##
## Decision tree:
##
## Petal.Length <= 1.9: setosa (38)
## Petal.Length > 1.9:
## :...Petal.Width <= 1.7: versicolor (30/2)
## Petal.Width > 1.7: virginica (32/1)
##
## ----- Trial 1: -----
##
## Decision tree:
##
## Petal.Length <= 1.9: setosa (28.8)
## Petal.Length > 1.9:
## :...Petal.Length <= 4.9: versicolor (31.6/2.3)
## Petal.Length > 4.9: virginica (39.6/0.8)
##
## ----- Trial 2: -----
##
## Decision tree:
##
## Petal.Length <= 1.9: setosa (21.8)
## Petal.Length > 1.9:
## :...Petal.Length <= 4.7: versicolor (14.4)
## Petal.Length > 4.7: virginica (63.8/14.5)
##
## ----- Trial 3: -----
##
## Decision tree:
##
## Petal.Length <= 1.9: setosa (17.3)
## Petal.Length > 1.9:
## :...Petal.Length <= 5: versicolor (65.6/22)
## Petal.Length > 5: virginica (17.1)
##
## ----- Trial 4: -----
##
## Decision tree:
##
## Petal.Length <= 1.9: setosa (14.2)
## Petal.Length > 1.9:
## :...Petal.Width > 1.8: virginica (21.4)
## Petal.Width <= 1.8:
## :...Sepal.Length <= 5.9: versicolor (15.1)
## Sepal.Length > 5.9:
## :...Sepal.Length <= 6.4: virginica (33.5/5.6)
## Sepal.Length > 6.4: versicolor (15.8/0.7)
##
## ----- Trial 5: -----
##
## Decision tree:
##
## Petal.Length <= 1.9: setosa (10.9)
## Petal.Length > 1.9:
## :...Petal.Length <= 4.7: versicolor (20)
## Petal.Length > 4.7:
## :...Petal.Length > 5: virginica (17.2)
## Petal.Length <= 5:
## :...Sepal.Length <= 6.2: virginica (32.5/7)
## Sepal.Length > 6.2: versicolor (19.5/2.1)
##
## ----- Trial 6: -----
##
## Decision tree:
##
## Petal.Length <= 1.9: setosa (8.4)
## Petal.Length > 1.9:
## :...Petal.Width > 1.8: virginica (23.4)
## Petal.Width <= 1.8:
## :...Petal.Length > 5: virginica (8.4)
## Petal.Length <= 5:
## :...Sepal.Width <= 2.2: virginica (4.8/0.2)
## Sepal.Width > 2.2: versicolor (55/9)
##
## ----- Trial 7: -----
##
## Decision tree:
##
## Sepal.Width > 3: versicolor (25.6/10)
## Sepal.Width <= 3:
## :...Petal.Width <= 1.7: versicolor (33.6/6.6)
## Petal.Width > 1.7: virginica (40.8)
##
## ----- Trial 8: -----
##
## Decision tree:
##
## Petal.Length <= 1.9: setosa (18.1)
## Petal.Length > 1.9:
## :...Sepal.Length <= 5.9: versicolor (22.5/4.1)
## Sepal.Length > 5.9:
## :...Petal.Width <= 1.7: versicolor (21.9/6.1)
## Petal.Width > 1.7: virginica (37.6)
##
## ----- Trial 9: -----
##
## Decision tree:
##
## Petal.Length <= 1.9: setosa (14.1)
## Petal.Length > 1.9:
## :...Petal.Width <= 1.4: versicolor (11.2)
## Petal.Width > 1.4:
## :...Petal.Width > 1.8: virginica (28.9)
## Petal.Width <= 1.8:
## :...Sepal.Length <= 5.9: versicolor (8.6)
## Sepal.Length > 5.9: virginica (37.2/6.7)
##
##
## Evaluation on training data (100 cases):
##
## Trial Decision Tree
## ----- ----------------
## Size Errors
##
## 0 3 3( 3.0%)
## 1 3 4( 4.0%)
## 2 3 4( 4.0%)
## 3 3 6( 6.0%)
## 4 5 7( 7.0%)
## 5 5 2( 2.0%)
## 6 5 3( 3.0%)
## 7 3 55(55.0%)
## 8 4 6( 6.0%)
## 9 5 5( 5.0%)
## boost 0( 0.0%) <<
##
##
## (a) (b) (c) <-classified as
## ---- ---- ----
## 38 (a): class setosa
## 29 (b): class versicolor
## 33 (c): class virginica
##
##
## Attribute usage:
##
## 100.00% Sepal.Width
## 100.00% Petal.Length
## 68.00% Petal.Width
## 62.00% Sepal.Length
##
##
## Time: 0.0 secs
# Predict the test set with the boosted model and cross-tabulate the
# actual species against the predictions.
iris_boost_predict <- predict(iris_model_boost10, newdata = iris_test)
CrossTable(
  x = iris_test$Species,
  y = iris_boost_predict,
  prop.r = FALSE,
  prop.c = FALSE,
  prop.chisq = FALSE,
  dnn = c("Actual", "Prediccion")
)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 50
##
##
## | Prediccion
## Actual | setosa | versicolor | virginica | Row Total |
## -------------|------------|------------|------------|------------|
## setosa | 12 | 0 | 0 | 12 |
## | 0.240 | 0.000 | 0.000 | |
## -------------|------------|------------|------------|------------|
## versicolor | 0 | 20 | 1 | 21 |
## | 0.000 | 0.400 | 0.020 | |
## -------------|------------|------------|------------|------------|
## virginica | 0 | 1 | 16 | 17 |
## | 0.000 | 0.020 | 0.320 | |
## -------------|------------|------------|------------|------------|
## Column Total | 12 | 21 | 17 | 50 |
## -------------|------------|------------|------------|------------|
##
##
# El modelo es levemente mejor que antes del boosting:
# dos errores de clasificacion en lugar de tres.