library(rpart)      # Arboles
## Warning: package 'rpart' was built under R version 3.6.3
library(rpart.plot) # Visualizar y representar árboles
## Warning: package 'rpart.plot' was built under R version 3.6.3
library(caret)      # Para llevar a cabo particiones de conjuntos de datos en caso de...
## Warning: package 'caret' was built under R version 3.6.2
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.6.2
library(dplyr)      # Para select, filter, mutate, arrange ....
## Warning: package 'dplyr' was built under R version 3.6.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)      # Para leer datos
library(ggplot2)    # Para grafica mas vistosas
library(reshape)
## Warning: package 'reshape' was built under R version 3.6.3
## 
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
## 
##     rename
# Load the red-wine data set from CSV into `datos`: eleven numeric
# measurement columns plus the numeric `quality` score.
datos <- readr::read_csv(
  file = "~/tabajos diplomado/modulo 5/Git/FundaMachineLearning/datos/winequality-red.csv"
)
## Parsed with column specification:
## cols(
##   `fixed acidity` = col_double(),
##   `volatile acidity` = col_double(),
##   `citric acid` = col_double(),
##   `residual sugar` = col_double(),
##   chlorides = col_double(),
##   `free sulfur dioxide` = col_double(),
##   `total sulfur dioxide` = col_double(),
##   density = col_double(),
##   pH = col_double(),
##   sulphates = col_double(),
##   alcohol = col_double(),
##   quality = col_double()
## )
# Show the first six rows of the data set.
head(datos, n = 6)
## # A tibble: 6 x 12
##   `fixed acidity` `volatile acidi… `citric acid` `residual sugar` chlorides
##             <dbl>            <dbl>         <dbl>            <dbl>     <dbl>
## 1             7.4             0.7           0                 1.9     0.076
## 2             7.8             0.88          0                 2.6     0.098
## 3             7.8             0.76          0.04              2.3     0.092
## 4            11.2             0.28          0.56              1.9     0.075
## 5             7.4             0.7           0                 1.9     0.076
## 6             7.4             0.66          0                 1.8     0.075
## # … with 7 more variables: `free sulfur dioxide` <dbl>, `total sulfur
## #   dioxide` <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>, alcohol <dbl>,
## #   quality <dbl>
# Show the last six rows of the data set.
tail(datos, n = 6)
## # A tibble: 6 x 12
##   `fixed acidity` `volatile acidi… `citric acid` `residual sugar` chlorides
##             <dbl>            <dbl>         <dbl>            <dbl>     <dbl>
## 1             6.8            0.62           0.08              1.9     0.068
## 2             6.2            0.6            0.08              2       0.09 
## 3             5.9            0.55           0.1               2.2     0.062
## 4             6.3            0.51           0.13              2.3     0.076
## 5             5.9            0.645          0.12              2       0.075
## 6             6              0.31           0.47              3.6     0.067
## # … with 7 more variables: `free sulfur dioxide` <dbl>, `total sulfur
## #   dioxide` <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>, alcohol <dbl>,
## #   quality <dbl>
# Per-column summary (min, quartiles, mean, max) of the full data set.
summary(object = datos)
##  fixed acidity   volatile acidity  citric acid    residual sugar  
##  Min.   : 4.60   Min.   :0.1200   Min.   :0.000   Min.   : 0.900  
##  1st Qu.: 7.10   1st Qu.:0.3900   1st Qu.:0.090   1st Qu.: 1.900  
##  Median : 7.90   Median :0.5200   Median :0.260   Median : 2.200  
##  Mean   : 8.32   Mean   :0.5278   Mean   :0.271   Mean   : 2.539  
##  3rd Qu.: 9.20   3rd Qu.:0.6400   3rd Qu.:0.420   3rd Qu.: 2.600  
##  Max.   :15.90   Max.   :1.5800   Max.   :1.000   Max.   :15.500  
##    chlorides       free sulfur dioxide total sulfur dioxide    density      
##  Min.   :0.01200   Min.   : 1.00       Min.   :  6.00       Min.   :0.9901  
##  1st Qu.:0.07000   1st Qu.: 7.00       1st Qu.: 22.00       1st Qu.:0.9956  
##  Median :0.07900   Median :14.00       Median : 38.00       Median :0.9968  
##  Mean   :0.08747   Mean   :15.87       Mean   : 46.47       Mean   :0.9967  
##  3rd Qu.:0.09000   3rd Qu.:21.00       3rd Qu.: 62.00       3rd Qu.:0.9978  
##  Max.   :0.61100   Max.   :72.00       Max.   :289.00       Max.   :1.0037  
##        pH          sulphates         alcohol         quality     
##  Min.   :2.740   Min.   :0.3300   Min.   : 8.40   Min.   :3.000  
##  1st Qu.:3.210   1st Qu.:0.5500   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.310   Median :0.6200   Median :10.20   Median :6.000  
##  Mean   :3.311   Mean   :0.6581   Mean   :10.42   Mean   :5.636  
##  3rd Qu.:3.400   3rd Qu.:0.7300   3rd Qu.:11.10   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :2.0000   Max.   :14.90   Max.   :8.000
# Compact structure listing: classes, dimensions and column types.
utils::str(datos)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 1599 obs. of  12 variables:
##  $ fixed acidity       : num  7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
##  $ volatile acidity    : num  0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
##  $ citric acid         : num  0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
##  $ residual sugar      : num  1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
##  $ chlorides           : num  0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
##  $ free sulfur dioxide : num  11 25 15 17 11 13 15 15 9 17 ...
##  $ total sulfur dioxide: num  34 67 54 60 34 40 59 21 18 102 ...
##  $ density             : num  0.998 0.997 0.997 0.998 0.998 ...
##  $ pH                  : num  3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
##  $ sulphates           : num  0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
##  $ alcohol             : num  9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
##  $ quality             : num  5 5 5 6 5 5 5 7 7 5 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `fixed acidity` = col_double(),
##   ..   `volatile acidity` = col_double(),
##   ..   `citric acid` = col_double(),
##   ..   `residual sugar` = col_double(),
##   ..   chlorides = col_double(),
##   ..   `free sulfur dioxide` = col_double(),
##   ..   `total sulfur dioxide` = col_double(),
##   ..   density = col_double(),
##   ..   pH = col_double(),
##   ..   sulphates = col_double(),
##   ..   alcohol = col_double(),
##   ..   quality = col_double()
##   .. )
# Fix the random seed so the train/validation split below is reproducible.
set.seed(seed = 2020)
# Draw the row indices of a 70 % training sample based on `quality`.
# With list = FALSE the result is a one-column index matrix, not a list.
entrena <- caret::createDataPartition(
  y = datos$quality,
  p = 0.7,
  list = FALSE
)
# First selected row indices.
head(entrena, n = 6)
##      Resample1
## [1,]         1
## [2,]         2
## [3,]         3
## [4,]         4
## [5,]         5
## [6,]         6
# Number of rows assigned to the training set.
nrow(x = entrena)
## [1] 1120
# Preview the rows that are NOT in the training sample.
# `entrena` is a one-column matrix; tibbles deprecate matrix row subscripts
# (tibble >= 3.0.0), so pass its single column as a plain index vector.
head(datos[-entrena[, 1], ])
## # A tibble: 6 x 12
##   `fixed acidity` `volatile acidi… `citric acid` `residual sugar` chlorides
##             <dbl>            <dbl>         <dbl>            <dbl>     <dbl>
## 1             7.9             0.6           0.06              1.6     0.069
## 2             7.3             0.65          0                 1.2     0.065
## 3             7.5             0.5           0.36              6.1     0.071
## 4             8.5             0.28          0.56              1.8     0.092
## 5             8.1             0.56          0.28              1.7     0.368
## 6             7.9             0.43          0.21              1.6     0.106
## # … with 7 more variables: `free sulfur dioxide` <dbl>, `total sulfur
## #   dioxide` <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>, alcohol <dbl>,
## #   quality <dbl>
# Number of rows left out of the training sample.
# Use the index matrix's single column as a plain vector: tibbles deprecate
# matrix row subscripts (tibble >= 3.0.0).
nrow(datos[-entrena[, 1], ])
## [1] 479
# First rows of the full data set again, for comparison with the splits.
head(datos)
## # A tibble: 6 x 12
##   `fixed acidity` `volatile acidi… `citric acid` `residual sugar` chlorides
##             <dbl>            <dbl>         <dbl>            <dbl>     <dbl>
## 1             7.4             0.7           0                 1.9     0.076
## 2             7.8             0.88          0                 2.6     0.098
## 3             7.8             0.76          0.04              2.3     0.092
## 4            11.2             0.28          0.56              1.9     0.075
## 5             7.4             0.7           0                 1.9     0.076
## 6             7.4             0.66          0                 1.8     0.075
## # … with 7 more variables: `free sulfur dioxide` <dbl>, `total sulfur
## #   dioxide` <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>, alcohol <dbl>,
## #   quality <dbl>
# Training set: the rows selected by the partition.
# `entrena` is a one-column matrix; tibbles deprecate matrix row subscripts
# (tibble >= 3.0.0), so index with its single column as a plain vector.
datos.Entrena <- datos[entrena[, 1], ]
head(datos.Entrena)
## # A tibble: 6 x 12
##   `fixed acidity` `volatile acidi… `citric acid` `residual sugar` chlorides
##             <dbl>            <dbl>         <dbl>            <dbl>     <dbl>
## 1             7.4             0.7           0                 1.9     0.076
## 2             7.8             0.88          0                 2.6     0.098
## 3             7.8             0.76          0.04              2.3     0.092
## 4            11.2             0.28          0.56              1.9     0.075
## 5             7.4             0.7           0                 1.9     0.076
## 6             7.4             0.66          0                 1.8     0.075
## # … with 7 more variables: `free sulfur dioxide` <dbl>, `total sulfur
## #   dioxide` <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>, alcohol <dbl>,
## #   quality <dbl>
# Per-column summary of the training set.
summary(object = datos.Entrena)
##  fixed acidity   volatile acidity  citric acid     residual sugar  
##  Min.   : 4.70   Min.   :0.1200   Min.   :0.0000   Min.   : 0.900  
##  1st Qu.: 7.10   1st Qu.:0.4000   1st Qu.:0.0975   1st Qu.: 1.900  
##  Median : 7.90   Median :0.5300   Median :0.2500   Median : 2.200  
##  Mean   : 8.34   Mean   :0.5326   Mean   :0.2691   Mean   : 2.554  
##  3rd Qu.: 9.20   3rd Qu.:0.6400   3rd Qu.:0.4300   3rd Qu.: 2.600  
##  Max.   :15.90   Max.   :1.5800   Max.   :0.7900   Max.   :15.500  
##    chlorides       free sulfur dioxide total sulfur dioxide    density      
##  Min.   :0.03400   Min.   : 1.00       Min.   :  6.00       Min.   :0.9901  
##  1st Qu.:0.07100   1st Qu.: 8.00       1st Qu.: 22.00       1st Qu.:0.9956  
##  Median :0.08000   Median :14.00       Median : 38.00       Median :0.9968  
##  Mean   :0.08693   Mean   :16.13       Mean   : 46.82       Mean   :0.9968  
##  3rd Qu.:0.09025   3rd Qu.:22.00       3rd Qu.: 62.00       3rd Qu.:0.9979  
##  Max.   :0.46700   Max.   :72.00       Max.   :289.00       Max.   :1.0037  
##        pH          sulphates         alcohol        quality     
##  Min.   :2.860   Min.   :0.3700   Min.   : 8.4   Min.   :3.000  
##  1st Qu.:3.210   1st Qu.:0.5500   1st Qu.: 9.5   1st Qu.:5.000  
##  Median :3.310   Median :0.6200   Median :10.1   Median :6.000  
##  Mean   :3.311   Mean   :0.6588   Mean   :10.4   Mean   :5.635  
##  3rd Qu.:3.400   3rd Qu.:0.7300   3rd Qu.:11.1   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :1.9800   Max.   :14.9   Max.   :8.000
# Validation set: every row that was not selected for training.
# `entrena` is a one-column matrix; tibbles deprecate matrix row subscripts
# (tibble >= 3.0.0), so negate its single column as a plain index vector.
datos.Valida <- datos[-entrena[, 1], ]
head(datos.Valida)
## # A tibble: 6 x 12
##   `fixed acidity` `volatile acidi… `citric acid` `residual sugar` chlorides
##             <dbl>            <dbl>         <dbl>            <dbl>     <dbl>
## 1             7.9             0.6           0.06              1.6     0.069
## 2             7.3             0.65          0                 1.2     0.065
## 3             7.5             0.5           0.36              6.1     0.071
## 4             8.5             0.28          0.56              1.8     0.092
## 5             8.1             0.56          0.28              1.7     0.368
## 6             7.9             0.43          0.21              1.6     0.106
## # … with 7 more variables: `free sulfur dioxide` <dbl>, `total sulfur
## #   dioxide` <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>, alcohol <dbl>,
## #   quality <dbl>
# Per-column summary of the validation set.
summary(object = datos.Valida)
##  fixed acidity    volatile acidity  citric acid     residual sugar 
##  Min.   : 4.600   Min.   :0.1200   Min.   :0.0000   Min.   :1.200  
##  1st Qu.: 7.100   1st Qu.:0.3900   1st Qu.:0.0900   1st Qu.:1.900  
##  Median : 7.900   Median :0.5000   Median :0.2800   Median :2.200  
##  Mean   : 8.272   Mean   :0.5166   Mean   :0.2753   Mean   :2.503  
##  3rd Qu.: 9.200   3rd Qu.:0.6300   3rd Qu.:0.4200   3rd Qu.:2.600  
##  Max.   :15.600   Max.   :1.1850   Max.   :1.0000   Max.   :9.000  
##    chlorides       free sulfur dioxide total sulfur dioxide    density      
##  Min.   :0.01200   Min.   : 1.00       Min.   :  6.00       Min.   :0.9902  
##  1st Qu.:0.06800   1st Qu.: 7.00       1st Qu.: 21.50       1st Qu.:0.9956  
##  Median :0.07800   Median :12.00       Median : 36.00       Median :0.9967  
##  Mean   :0.08872   Mean   :15.29       Mean   : 45.64       Mean   :0.9966  
##  3rd Qu.:0.09000   3rd Qu.:21.00       3rd Qu.: 63.00       3rd Qu.:0.9978  
##  Max.   :0.61100   Max.   :68.00       Max.   :155.00       Max.   :1.0031  
##        pH          sulphates         alcohol         quality     
##  Min.   :2.740   Min.   :0.3300   Min.   : 9.00   Min.   :3.000  
##  1st Qu.:3.210   1st Qu.:0.5500   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.310   Median :0.6200   Median :10.30   Median :6.000  
##  Mean   :3.312   Mean   :0.6566   Mean   :10.47   Mean   :5.639  
##  3rd Qu.:3.410   3rd Qu.:0.7300   3rd Qu.:11.10   3rd Qu.:6.000  
##  Max.   :3.900   Max.   :2.0000   Max.   :14.00   Max.   :8.000
# Fit a tree that predicts `quality` from all other columns of the training
# set, then print the fitted splits (node, split rule, n, deviance, yval).
arbol <- rpart(
  quality ~ .,
  data = datos.Entrena
)
arbol
## n= 1120 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 1120 751.64200 5.634821  
##    2) alcohol< 10.525 693 314.90330 5.366522  
##      4) volatile acidity>=0.345 621 253.08530 5.302738  
##        8) sulphates< 0.535 155  35.97419 5.012903 *
##        9) sulphates>=0.535 466 199.75970 5.399142  
##         18) alcohol< 9.85 297 100.24240 5.282828 *
##         19) alcohol>=9.85 169  88.43787 5.603550 *
##      5) volatile acidity< 0.345 72  37.50000 5.916667 *
##    3) alcohol>=10.525 427 305.89230 6.070258  
##      6) sulphates< 0.625 157 101.73250 5.649682  
##       12) volatile acidity>=1.015 8   4.87500 4.125000 *
##       13) volatile acidity< 1.015 149  77.26174 5.731544  
##         26) alcohol< 11.65 94  47.48936 5.510638  
##           52) free sulfur dioxide< 8.5 36  15.55556 5.111111 *
##           53) free sulfur dioxide>=8.5 58  22.62069 5.758621 *
##         27) alcohol>=11.65 55  17.34545 6.109091 *
##      7) sulphates>=0.625 270 160.24070 6.314815  
##       14) alcohol< 11.55 157  84.36943 6.101911  
##         28) volatile acidity>=0.395 84  32.03571 5.892857 *
##         29) volatile acidity< 0.395 73  44.43836 6.342466  
##           58) pH>=3.25 45  24.80000 6.066667 *
##           59) pH< 3.25 28  10.71429 6.785714 *
##       15) alcohol>=11.55 113  58.86726 6.610619 *
# Draw the fitted tree. The options set the label layout, node numbering,
# leaf placement, name abbreviation lengths and shadow colour (see ?prp).
prp(
  arbol,
  type = 2,
  nn = TRUE,
  fallen.leaves = TRUE,
  faclen = 4,
  varlen = 8,
  shadow.col = "gray"
)

# Complexity-parameter table of the fit (CP, nsplit, rel error, xerror, xstd).
arbol[["cptable"]]
##            CP nsplit rel error    xerror       xstd
## 1  0.17408072      0 1.0000000 1.0005542 0.04657576
## 2  0.05843081      1 0.8259193 0.8570500 0.04479501
## 3  0.03235313      2 0.7674885 0.7920724 0.03916559
## 4  0.02607058      3 0.7351353 0.7756639 0.03890049
## 5  0.02308479      4 0.7090648 0.7551788 0.03815786
## 6  0.02262255      5 0.6859800 0.7440868 0.03767959
## 7  0.01653304      6 0.6633574 0.7111984 0.03645750
## 8  0.01474021      7 0.6468244 0.6966805 0.03590473
## 9  0.01239036      8 0.6320842 0.6877223 0.03508585
## 10 0.01118846      9 0.6196938 0.6926970 0.03504693
## 11 0.01000000     11 0.5973169 0.6787878 0.03469302
# Plot the cross-validation results stored in the complexity-parameter table.
plotcp(x = arbol)

# Predict `quality` for every row of the validation set.
prediccion.quality <- predict(object = arbol, newdata = datos.Valida)

# Show the first validation row next to its predicted quality.
datos.Valida[1, ]
## # A tibble: 1 x 12
##   `fixed acidity` `volatile acidi… `citric acid` `residual sugar` chlorides
##             <dbl>            <dbl>         <dbl>            <dbl>     <dbl>
## 1             7.9              0.6          0.06              1.6     0.069
## # … with 7 more variables: `free sulfur dioxide` <dbl>, `total sulfur
## #   dioxide` <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>, alcohol <dbl>,
## #   quality <dbl>
prediccion.quality[1L]
##        1 
## 5.012903
# Score one hand-written observation with the fitted tree.
#
# The columns must carry exactly the names used for training, spaces
# included. By default data.frame() rewrites such names to `fixed.acidity`,
# `volatile.acidity`, ...; predict() then cannot find them in `newdata` and
# falls back to same-named variables in the workspace, which hides the
# mismatch. The values are therefore given as named columns with
# check.names = FALSE instead of as free-standing global variables.
nuevo.dato <- data.frame(
  `fixed acidity` = 7.0,
  `volatile acidity` = 0.68,
  `citric acid` = 0.16,
  `residual sugar` = 5,
  chlorides = 0.055,
  `free sulfur dioxide` = 15,
  `total sulfur dioxide` = 60,
  density = 0.95,
  pH = 4,
  sulphates = 0.6,
  alcohol = 8.5,
  check.names = FALSE
)
# Fail loudly if any training predictor is missing from the new observation,
# rather than letting predict() resolve it from somewhere else.
stopifnot(
  all(setdiff(names(datos.Entrena), "quality") %in% names(nuevo.dato))
)
nuevo.dato
##   fixed acidity volatile acidity citric acid residual sugar chlorides
## 1             7             0.68        0.16              5     0.055
##   free sulfur dioxide total sulfur dioxide density pH sulphates alcohol
## 1                  15                   60    0.95  4       0.6     8.5
# Keep the new prediction in its own object so the validation-set
# predictions held in `prediccion.quality` are not overwritten.
prediccion.nuevo <- predict(arbol, newdata = nuevo.dato)

prediccion.nuevo
##        1 
## 5.282828
# First validation row next to its own prediction.
datos.Valida[1,]
## # A tibble: 1 x 12
##   `fixed acidity` `volatile acidi… `citric acid` `residual sugar` chlorides
##             <dbl>            <dbl>         <dbl>            <dbl>     <dbl>
## 1             7.9              0.6          0.06              1.6     0.069
## # … with 7 more variables: `free sulfur dioxide` <dbl>, `total sulfur
## #   dioxide` <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>, alcohol <dbl>,
## #   quality <dbl>
prediccion.quality[1]
##        1 
## 5.012903