# Classification model using trees and rules
# Understanding decision trees
#
# Decision trees are powerful classifiers that use a tree structure to
# model the relationships between the features and the potential outcomes.
#
# This example uses the iris data set, which contains 150 observations and
# five features. One of them is a factor called Species and holds the
# outcome. We use this data to build a decision tree model that tries to
# predict the species.
# Load the built-in iris data set and preview its first and last six rows.
data(iris)
head(iris, n = 6L)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
tail(iris, n = 6L)
##     Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
## 145          6.7         3.3          5.7         2.5 virginica
## 146          6.7         3.0          5.2         2.3 virginica
## 147          6.3         2.5          5.0         1.9 virginica
## 148          6.5         3.0          5.2         2.0 virginica
## 149          6.2         3.4          5.4         2.3 virginica
## 150          5.9         3.0          5.1         1.8 virginica
# Fix the RNG seed so the random train/test split is repeatable.
# NOTE(review): the recorded output in this script dates from 2016. R >= 3.6.0
# changed the default algorithm behind sample(), so the indices drawn on a
# current R may differ from the ones shown below -- confirm before comparing.
set.seed(1234)
# Draw the row indices of the training set at random (sample() draws without
# replacement by default). The population size is taken from the data instead
# of a hard-coded 150, so the split stays valid if the data set changes.
n_train <- 100L
train_sample <- sample(nrow(iris), n_train)
str(train_sample)
##  int [1:100] 18 93 91 92 126 149 2 34 95 73 ...
# Split the data: the sampled rows form the training set and all remaining
# rows form the test set.
iris_train <- iris[train_sample, ]
iris_test <- iris[-train_sample, ]
# Load the C50 package, which provides the C5.0() model-fitting function.
library(C50)
# Fit the decision tree. Column 5 (Species) holds the outcome we want to
# predict, so it is dropped from the predictors and passed separately as the
# class labels.
iris_model <- C5.0(x = iris_train[-5], y = iris_train$Species)
# Show the fitted model.
print(iris_model)
## 
## Call:
## C5.0.default(x = iris_train[-5], y = iris_train$Species)
## 
## Classification Tree
## Number of samples: 100 
## Number of predictors: 4 
## 
## Tree size: 3 
## 
## Non-standard options: attempt to group attributes
# Show the detailed description of the model.
print(summary(iris_model))
## 
## Call:
## C5.0.default(x = iris_train[-5], y = iris_train$Species)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Sun Jan 03 17:08:57 2016
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 100 cases (5 attributes) from undefined.data
## 
## Decision tree:
## 
## Petal.Length <= 1.9: setosa (38)
## Petal.Length > 1.9:
## :...Petal.Width <= 1.7: versicolor (30/2)
##     Petal.Width > 1.7: virginica (32/1)
## 
## 
## Evaluation on training data (100 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##       3    3( 3.0%)   <<
## 
## 
##     (a)   (b)   (c)    <-classified as
##    ----  ----  ----
##      38                (a): class setosa
##            28     1    (b): class versicolor
##             2    31    (c): class virginica
## 
## 
##  Attribute usage:
## 
##  100.00% Petal.Length
##   62.00% Petal.Width
## 
## 
## Time: 0.0 secs
# Use the fitted tree to predict the species of the test observations.
iris_pred <- predict(iris_model, newdata = iris_test)
# Load gmodels for CrossTable() and cross-tabulate the actual species against
# the predictions to analyse the results.
library(gmodels)
CrossTable(
  x = iris_test$Species,
  y = iris_pred,
  prop.r = FALSE,
  prop.c = FALSE,
  prop.chisq = FALSE,
  dnn = c("Especie Actual", "Prediccion")
)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  50 
## 
##  
##                | Prediccion 
## Especie Actual |     setosa | versicolor |  virginica |  Row Total | 
## ---------------|------------|------------|------------|------------|
##         setosa |         12 |          0 |          0 |         12 | 
##                |      0.240 |      0.000 |      0.000 |            | 
## ---------------|------------|------------|------------|------------|
##     versicolor |          0 |         21 |          0 |         21 | 
##                |      0.000 |      0.420 |      0.000 |            | 
## ---------------|------------|------------|------------|------------|
##      virginica |          0 |          3 |         14 |         17 | 
##                |      0.000 |      0.060 |      0.280 |            | 
## ---------------|------------|------------|------------|------------|
##   Column Total |         12 |         24 |         14 |         50 | 
## ---------------|------------|------------|------------|------------|
## 
## 
# The counts on the main diagonal are the model's correct predictions.
# Only 3 of the 17 virginica observations were misclassified
# (all three were predicted as versicolor).
# Use boosting to try to improve the model: refit the same predictors and
# outcome with trials = 10 (reported below as 10 boosting iterations).
iris_model_boost10 <- C5.0(
  x = iris_train[-5],
  y = iris_train$Species,
  trials = 10
)
# Show the boosted model.
print(iris_model_boost10)
## 
## Call:
## C5.0.default(x = iris_train[-5], y = iris_train$Species, trials = 10)
## 
## Classification Tree
## Number of samples: 100 
## Number of predictors: 4 
## 
## Number of boosting iterations: 10 
## Average tree size: 3.9 
## 
## Non-standard options: attempt to group attributes
# Show the per-trial trees and the evaluation on the training data.
print(summary(iris_model_boost10))
## 
## Call:
## C5.0.default(x = iris_train[-5], y = iris_train$Species, trials = 10)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Sun Jan 03 17:08:57 2016
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 100 cases (5 attributes) from undefined.data
## 
## -----  Trial 0:  -----
## 
## Decision tree:
## 
## Petal.Length <= 1.9: setosa (38)
## Petal.Length > 1.9:
## :...Petal.Width <= 1.7: versicolor (30/2)
##     Petal.Width > 1.7: virginica (32/1)
## 
## -----  Trial 1:  -----
## 
## Decision tree:
## 
## Petal.Length <= 1.9: setosa (28.8)
## Petal.Length > 1.9:
## :...Petal.Length <= 4.9: versicolor (31.6/2.3)
##     Petal.Length > 4.9: virginica (39.6/0.8)
## 
## -----  Trial 2:  -----
## 
## Decision tree:
## 
## Petal.Length <= 1.9: setosa (21.8)
## Petal.Length > 1.9:
## :...Petal.Length <= 4.7: versicolor (14.4)
##     Petal.Length > 4.7: virginica (63.8/14.5)
## 
## -----  Trial 3:  -----
## 
## Decision tree:
## 
## Petal.Length <= 1.9: setosa (17.3)
## Petal.Length > 1.9:
## :...Petal.Length <= 5: versicolor (65.6/22)
##     Petal.Length > 5: virginica (17.1)
## 
## -----  Trial 4:  -----
## 
## Decision tree:
## 
## Petal.Length <= 1.9: setosa (14.2)
## Petal.Length > 1.9:
## :...Petal.Width > 1.8: virginica (21.4)
##     Petal.Width <= 1.8:
##     :...Sepal.Length <= 5.9: versicolor (15.1)
##         Sepal.Length > 5.9:
##         :...Sepal.Length <= 6.4: virginica (33.5/5.6)
##             Sepal.Length > 6.4: versicolor (15.8/0.7)
## 
## -----  Trial 5:  -----
## 
## Decision tree:
## 
## Petal.Length <= 1.9: setosa (10.9)
## Petal.Length > 1.9:
## :...Petal.Length <= 4.7: versicolor (20)
##     Petal.Length > 4.7:
##     :...Petal.Length > 5: virginica (17.2)
##         Petal.Length <= 5:
##         :...Sepal.Length <= 6.2: virginica (32.5/7)
##             Sepal.Length > 6.2: versicolor (19.5/2.1)
## 
## -----  Trial 6:  -----
## 
## Decision tree:
## 
## Petal.Length <= 1.9: setosa (8.4)
## Petal.Length > 1.9:
## :...Petal.Width > 1.8: virginica (23.4)
##     Petal.Width <= 1.8:
##     :...Petal.Length > 5: virginica (8.4)
##         Petal.Length <= 5:
##         :...Sepal.Width <= 2.2: virginica (4.8/0.2)
##             Sepal.Width > 2.2: versicolor (55/9)
## 
## -----  Trial 7:  -----
## 
## Decision tree:
## 
## Sepal.Width > 3: versicolor (25.6/10)
## Sepal.Width <= 3:
## :...Petal.Width <= 1.7: versicolor (33.6/6.6)
##     Petal.Width > 1.7: virginica (40.8)
## 
## -----  Trial 8:  -----
## 
## Decision tree:
## 
## Petal.Length <= 1.9: setosa (18.1)
## Petal.Length > 1.9:
## :...Sepal.Length <= 5.9: versicolor (22.5/4.1)
##     Sepal.Length > 5.9:
##     :...Petal.Width <= 1.7: versicolor (21.9/6.1)
##         Petal.Width > 1.7: virginica (37.6)
## 
## -----  Trial 9:  -----
## 
## Decision tree:
## 
## Petal.Length <= 1.9: setosa (14.1)
## Petal.Length > 1.9:
## :...Petal.Width <= 1.4: versicolor (11.2)
##     Petal.Width > 1.4:
##     :...Petal.Width > 1.8: virginica (28.9)
##         Petal.Width <= 1.8:
##         :...Sepal.Length <= 5.9: versicolor (8.6)
##             Sepal.Length > 5.9: virginica (37.2/6.7)
## 
## 
## Evaluation on training data (100 cases):
## 
## Trial        Decision Tree   
## -----      ----------------  
##    Size      Errors  
## 
##    0      3    3( 3.0%)
##    1      3    4( 4.0%)
##    2      3    4( 4.0%)
##    3      3    6( 6.0%)
##    4      5    7( 7.0%)
##    5      5    2( 2.0%)
##    6      5    3( 3.0%)
##    7      3   55(55.0%)
##    8      4    6( 6.0%)
##    9      5    5( 5.0%)
## boost              0( 0.0%)   <<
## 
## 
##     (a)   (b)   (c)    <-classified as
##    ----  ----  ----
##      38                (a): class setosa
##            29          (b): class versicolor
##                  33    (c): class virginica
## 
## 
##  Attribute usage:
## 
##  100.00% Sepal.Width
##  100.00% Petal.Length
##   68.00% Petal.Width
##   62.00% Sepal.Length
## 
## 
## Time: 0.0 secs
# Predict the species of the test observations with the boosted model and
# cross-tabulate the actual species against those predictions.
iris_boost_predict <- predict(iris_model_boost10, newdata = iris_test)
CrossTable(
  x = iris_test$Species,
  y = iris_boost_predict,
  prop.r = FALSE,
  prop.c = FALSE,
  prop.chisq = FALSE,
  dnn = c("Actual", "Prediccion")
)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  50 
## 
##  
##              | Prediccion 
##       Actual |     setosa | versicolor |  virginica |  Row Total | 
## -------------|------------|------------|------------|------------|
##       setosa |         12 |          0 |          0 |         12 | 
##              |      0.240 |      0.000 |      0.000 |            | 
## -------------|------------|------------|------------|------------|
##   versicolor |          0 |         20 |          1 |         21 | 
##              |      0.000 |      0.400 |      0.020 |            | 
## -------------|------------|------------|------------|------------|
##    virginica |          0 |          1 |         16 |         17 | 
##              |      0.000 |      0.020 |      0.320 |            | 
## -------------|------------|------------|------------|------------|
## Column Total |         12 |         21 |         17 |         50 | 
## -------------|------------|------------|------------|------------|
## 
## 
# The boosted model is slightly better than the single tree: it misclassifies
# 2 of the 50 test observations instead of 3.