library(rpart) # Arboles
## Warning: package 'rpart' was built under R version 3.6.3
library(rpart.plot) # Visualizar y represenar árboles
## Warning: package 'rpart.plot' was built under R version 3.6.3
library(caret) # Para llevar a cabo particiones de conjuntos de datos en caso de...
## Warning: package 'caret' was built under R version 3.6.2
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.6.2
library(dplyr) # Para select, filter, mutate, arange ....
## Warning: package 'dplyr' was built under R version 3.6.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr) # Para leer datos
library(ggplot2) # Para grafica mas vistosas
library(reshape)
## Warning: package 'reshape' was built under R version 3.6.3
##
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
##
## rename
# Load the red-wine quality CSV into `datos` (readr echoes the parsed column
# specification when it reads the file).
# NOTE(review): the path is absolute and tied to one user's home directory;
# the script only runs where this exact folder layout exists.
datos <- read_csv(
  file = "~/tabajos diplomado/modulo 5/Git/FundaMachineLearning/datos/winequality-red.csv"
)
## Parsed with column specification:
## cols(
## `fixed acidity` = col_double(),
## `volatile acidity` = col_double(),
## `citric acid` = col_double(),
## `residual sugar` = col_double(),
## chlorides = col_double(),
## `free sulfur dioxide` = col_double(),
## `total sulfur dioxide` = col_double(),
## density = col_double(),
## pH = col_double(),
## sulphates = col_double(),
## alcohol = col_double(),
## quality = col_double()
## )
# Preview the first rows of the raw data.
head(x = datos)
## # A tibble: 6 x 12
## `fixed acidity` `volatile acidi… `citric acid` `residual sugar` chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7.4 0.7 0 1.9 0.076
## 2 7.8 0.88 0 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.7 0 1.9 0.076
## 6 7.4 0.66 0 1.8 0.075
## # … with 7 more variables: `free sulfur dioxide` <dbl>, `total sulfur
## # dioxide` <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>, alcohol <dbl>,
## # quality <dbl>
# Preview the last rows of the raw data.
tail(x = datos)
## # A tibble: 6 x 12
## `fixed acidity` `volatile acidi… `citric acid` `residual sugar` chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 6.8 0.62 0.08 1.9 0.068
## 2 6.2 0.6 0.08 2 0.09
## 3 5.9 0.55 0.1 2.2 0.062
## 4 6.3 0.51 0.13 2.3 0.076
## 5 5.9 0.645 0.12 2 0.075
## 6 6 0.31 0.47 3.6 0.067
## # … with 7 more variables: `free sulfur dioxide` <dbl>, `total sulfur
## # dioxide` <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>, alcohol <dbl>,
## # quality <dbl>
# Per-column descriptive statistics (min, quartiles, mean, max).
summary(object = datos)
## fixed acidity volatile acidity citric acid residual sugar
## Min. : 4.60 Min. :0.1200 Min. :0.000 Min. : 0.900
## 1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900
## Median : 7.90 Median :0.5200 Median :0.260 Median : 2.200
## Mean : 8.32 Mean :0.5278 Mean :0.271 Mean : 2.539
## 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600
## Max. :15.90 Max. :1.5800 Max. :1.000 Max. :15.500
## chlorides free sulfur dioxide total sulfur dioxide density
## Min. :0.01200 Min. : 1.00 Min. : 6.00 Min. :0.9901
## 1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 22.00 1st Qu.:0.9956
## Median :0.07900 Median :14.00 Median : 38.00 Median :0.9968
## Mean :0.08747 Mean :15.87 Mean : 46.47 Mean :0.9967
## 3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 62.00 3rd Qu.:0.9978
## Max. :0.61100 Max. :72.00 Max. :289.00 Max. :1.0037
## pH sulphates alcohol quality
## Min. :2.740 Min. :0.3300 Min. : 8.40 Min. :3.000
## 1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.310 Median :0.6200 Median :10.20 Median :6.000
## Mean :3.311 Mean :0.6581 Mean :10.42 Mean :5.636
## 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :14.90 Max. :8.000
# Compact structure listing: class, dimensions and the type of every column.
str(object = datos)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 1599 obs. of 12 variables:
## $ fixed acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free sulfur dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
## $ total sulfur dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : num 5 5 5 6 5 5 5 7 7 5 ...
## - attr(*, "spec")=
## .. cols(
## .. `fixed acidity` = col_double(),
## .. `volatile acidity` = col_double(),
## .. `citric acid` = col_double(),
## .. `residual sugar` = col_double(),
## .. chlorides = col_double(),
## .. `free sulfur dioxide` = col_double(),
## .. `total sulfur dioxide` = col_double(),
## .. density = col_double(),
## .. pH = col_double(),
## .. sulphates = col_double(),
## .. alcohol = col_double(),
## .. quality = col_double()
## .. )
# Fix the RNG seed so the train/validation split below is reproducible.
set.seed(seed = 2020)
# Ask caret for the row indices of the training partition (70% of the rows,
# sampled on the `quality` column). With `list = FALSE` the indices come back
# as a one-column matrix instead of a list.
entrena <- createDataPartition(
  y = datos[["quality"]],
  p = 0.7,
  list = FALSE
)
head(x = entrena)
## Resample1
## [1,] 1
## [2,] 2
## [3,] 3
## [4,] 4
## [5,] 5
## [6,] 6
# Number of rows selected for training.
nrow(entrena)
## [1] 1120
# Preview the rows left out of the training partition.
# `entrena` is a one-column matrix; its single column is extracted as a plain
# vector because newer tibble releases reject a matrix as a row subscript
# ("must be a simple vector, not a matrix"). The selected rows are unchanged.
head(datos[-entrena[, 1], ])
## # A tibble: 6 x 12
## `fixed acidity` `volatile acidi… `citric acid` `residual sugar` chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7.9 0.6 0.06 1.6 0.069
## 2 7.3 0.65 0 1.2 0.065
## 3 7.5 0.5 0.36 6.1 0.071
## 4 8.5 0.28 0.56 1.8 0.092
## 5 8.1 0.56 0.28 1.7 0.368
## 6 7.9 0.43 0.21 1.6 0.106
## # … with 7 more variables: `free sulfur dioxide` <dbl>, `total sulfur
## # dioxide` <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>, alcohol <dbl>,
## # quality <dbl>
# Number of rows left for validation.
# `entrena` is a one-column matrix; index with its column as a plain vector
# because newer tibble releases reject a matrix as a row subscript.
nrow(datos[-entrena[, 1], ])
## [1] 479
# Show the full data again for comparison with the partitions.
head(datos)
## # A tibble: 6 x 12
## `fixed acidity` `volatile acidi… `citric acid` `residual sugar` chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7.4 0.7 0 1.9 0.076
## 2 7.8 0.88 0 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.7 0 1.9 0.076
## 6 7.4 0.66 0 1.8 0.075
## # … with 7 more variables: `free sulfur dioxide` <dbl>, `total sulfur
## # dioxide` <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>, alcohol <dbl>,
## # quality <dbl>
# Training set: the rows whose indices were drawn into `entrena`.
# `entrena` is a one-column matrix; its column is passed as a plain vector
# because newer tibble releases reject a matrix as a row subscript.
datos.Entrena <- datos[entrena[, 1], ]
head(datos.Entrena)
## # A tibble: 6 x 12
## `fixed acidity` `volatile acidi… `citric acid` `residual sugar` chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7.4 0.7 0 1.9 0.076
## 2 7.8 0.88 0 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.7 0 1.9 0.076
## 6 7.4 0.66 0 1.8 0.075
## # … with 7 more variables: `free sulfur dioxide` <dbl>, `total sulfur
## # dioxide` <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>, alcohol <dbl>,
## # quality <dbl>
# Descriptive statistics of the training set, for comparison with the full data.
summary(object = datos.Entrena)
## fixed acidity volatile acidity citric acid residual sugar
## Min. : 4.70 Min. :0.1200 Min. :0.0000 Min. : 0.900
## 1st Qu.: 7.10 1st Qu.:0.4000 1st Qu.:0.0975 1st Qu.: 1.900
## Median : 7.90 Median :0.5300 Median :0.2500 Median : 2.200
## Mean : 8.34 Mean :0.5326 Mean :0.2691 Mean : 2.554
## 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.4300 3rd Qu.: 2.600
## Max. :15.90 Max. :1.5800 Max. :0.7900 Max. :15.500
## chlorides free sulfur dioxide total sulfur dioxide density
## Min. :0.03400 Min. : 1.00 Min. : 6.00 Min. :0.9901
## 1st Qu.:0.07100 1st Qu.: 8.00 1st Qu.: 22.00 1st Qu.:0.9956
## Median :0.08000 Median :14.00 Median : 38.00 Median :0.9968
## Mean :0.08693 Mean :16.13 Mean : 46.82 Mean :0.9968
## 3rd Qu.:0.09025 3rd Qu.:22.00 3rd Qu.: 62.00 3rd Qu.:0.9979
## Max. :0.46700 Max. :72.00 Max. :289.00 Max. :1.0037
## pH sulphates alcohol quality
## Min. :2.860 Min. :0.3700 Min. : 8.4 Min. :3.000
## 1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.5 1st Qu.:5.000
## Median :3.310 Median :0.6200 Median :10.1 Median :6.000
## Mean :3.311 Mean :0.6588 Mean :10.4 Mean :5.635
## 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.1 3rd Qu.:6.000
## Max. :4.010 Max. :1.9800 Max. :14.9 Max. :8.000
# Validation set: every row NOT drawn into `entrena`.
# `entrena` is a one-column matrix; its column is passed as a plain vector
# because newer tibble releases reject a matrix as a row subscript.
datos.Valida <- datos[-entrena[, 1], ]
head(datos.Valida)
## # A tibble: 6 x 12
## `fixed acidity` `volatile acidi… `citric acid` `residual sugar` chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7.9 0.6 0.06 1.6 0.069
## 2 7.3 0.65 0 1.2 0.065
## 3 7.5 0.5 0.36 6.1 0.071
## 4 8.5 0.28 0.56 1.8 0.092
## 5 8.1 0.56 0.28 1.7 0.368
## 6 7.9 0.43 0.21 1.6 0.106
## # … with 7 more variables: `free sulfur dioxide` <dbl>, `total sulfur
## # dioxide` <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>, alcohol <dbl>,
## # quality <dbl>
# Descriptive statistics of the validation set.
summary(object = datos.Valida)
## fixed acidity volatile acidity citric acid residual sugar
## Min. : 4.600 Min. :0.1200 Min. :0.0000 Min. :1.200
## 1st Qu.: 7.100 1st Qu.:0.3900 1st Qu.:0.0900 1st Qu.:1.900
## Median : 7.900 Median :0.5000 Median :0.2800 Median :2.200
## Mean : 8.272 Mean :0.5166 Mean :0.2753 Mean :2.503
## 3rd Qu.: 9.200 3rd Qu.:0.6300 3rd Qu.:0.4200 3rd Qu.:2.600
## Max. :15.600 Max. :1.1850 Max. :1.0000 Max. :9.000
## chlorides free sulfur dioxide total sulfur dioxide density
## Min. :0.01200 Min. : 1.00 Min. : 6.00 Min. :0.9902
## 1st Qu.:0.06800 1st Qu.: 7.00 1st Qu.: 21.50 1st Qu.:0.9956
## Median :0.07800 Median :12.00 Median : 36.00 Median :0.9967
## Mean :0.08872 Mean :15.29 Mean : 45.64 Mean :0.9966
## 3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 63.00 3rd Qu.:0.9978
## Max. :0.61100 Max. :68.00 Max. :155.00 Max. :1.0031
## pH sulphates alcohol quality
## Min. :2.740 Min. :0.3300 Min. : 9.00 Min. :3.000
## 1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.310 Median :0.6200 Median :10.30 Median :6.000
## Mean :3.312 Mean :0.6566 Mean :10.47 Mean :5.639
## 3rd Qu.:3.410 3rd Qu.:0.7300 3rd Qu.:11.10 3rd Qu.:6.000
## Max. :3.900 Max. :2.0000 Max. :14.00 Max. :8.000
# Fit a tree that predicts `quality` from all remaining columns of the
# training set, then print its node/split listing.
arbol <- rpart(
  formula = quality ~ .,
  data = datos.Entrena
)
arbol
## n= 1120
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 1120 751.64200 5.634821
## 2) alcohol< 10.525 693 314.90330 5.366522
## 4) volatile acidity>=0.345 621 253.08530 5.302738
## 8) sulphates< 0.535 155 35.97419 5.012903 *
## 9) sulphates>=0.535 466 199.75970 5.399142
## 18) alcohol< 9.85 297 100.24240 5.282828 *
## 19) alcohol>=9.85 169 88.43787 5.603550 *
## 5) volatile acidity< 0.345 72 37.50000 5.916667 *
## 3) alcohol>=10.525 427 305.89230 6.070258
## 6) sulphates< 0.625 157 101.73250 5.649682
## 12) volatile acidity>=1.015 8 4.87500 4.125000 *
## 13) volatile acidity< 1.015 149 77.26174 5.731544
## 26) alcohol< 11.65 94 47.48936 5.510638
## 52) free sulfur dioxide< 8.5 36 15.55556 5.111111 *
## 53) free sulfur dioxide>=8.5 58 22.62069 5.758621 *
## 27) alcohol>=11.65 55 17.34545 6.109091 *
## 7) sulphates>=0.625 270 160.24070 6.314815
## 14) alcohol< 11.55 157 84.36943 6.101911
## 28) volatile acidity>=0.395 84 32.03571 5.892857 *
## 29) volatile acidity< 0.395 73 44.43836 6.342466
## 58) pH>=3.25 45 24.80000 6.066667 *
## 59) pH< 3.25 28 10.71429 6.785714 *
## 15) alcohol>=11.55 113 58.86726 6.610619 *
# Draw the fitted tree with rpart.plot::prp(); the display options are passed
# one per line.
prp(
  arbol,
  type = 2,
  nn = TRUE,
  fallen.leaves = TRUE,
  faclen = 4,
  varlen = 8,
  shadow.col = "gray"
)

# Complexity-parameter table of the fit (CP, number of splits, relative error,
# cross-validated error and its standard deviation).
arbol[["cptable"]]
## CP nsplit rel error xerror xstd
## 1 0.17408072 0 1.0000000 1.0005542 0.04657576
## 2 0.05843081 1 0.8259193 0.8570500 0.04479501
## 3 0.03235313 2 0.7674885 0.7920724 0.03916559
## 4 0.02607058 3 0.7351353 0.7756639 0.03890049
## 5 0.02308479 4 0.7090648 0.7551788 0.03815786
## 6 0.02262255 5 0.6859800 0.7440868 0.03767959
## 7 0.01653304 6 0.6633574 0.7111984 0.03645750
## 8 0.01474021 7 0.6468244 0.6966805 0.03590473
## 9 0.01239036 8 0.6320842 0.6877223 0.03508585
## 10 0.01118846 9 0.6196938 0.6926970 0.03504693
## 11 0.01000000 11 0.5973169 0.6787878 0.03469302
# Plot the complexity-parameter table of the fitted tree.
plotcp(x = arbol)

# Predict `quality` for every row of the validation set with the fitted tree.
prediccion.quality <- predict(
  object = arbol,
  newdata = datos.Valida
)
# First validation row, shown next to its prediction below.
datos.Valida[1L, ]
## # A tibble: 1 x 12
## `fixed acidity` `volatile acidi… `citric acid` `residual sugar` chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7.9 0.6 0.06 1.6 0.069
## # … with 7 more variables: `free sulfur dioxide` <dbl>, `total sulfur
## # dioxide` <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>, alcohol <dbl>,
## # quality <dbl>
# Predicted quality for the first validation row.
prediccion.quality[1L]
## 1
## 5.012903
# Feature values of one hypothetical wine to score with the fitted tree.
# The scalars are kept as top-level objects exactly as before.
`fixed acidity` <- 7.0
`volatile acidity` <- .68
`citric acid` <- .16
`residual sugar` <- 5
`chlorides` <- .055
`free sulfur dioxide` <- 15
`total sulfur dioxide` <- 60
`density` <- .95
`pH` <- 4
`sulphates` <- .6
`alcohol` <- 8.5
# Assemble the one-row data frame passed to predict() as `newdata`.
# `check.names = FALSE` with explicit names keeps the column names identical
# to the training data ("fixed acidity", not "fixed.acidity"). With the default
# name mangling none of the model's variables exist in the data frame, and
# predict() only works because it silently falls back to the same-named
# global scalars defined above.
nuevo.dato <- data.frame(
  "fixed acidity" = `fixed acidity`,
  "volatile acidity" = `volatile acidity`,
  "citric acid" = `citric acid`,
  "residual sugar" = `residual sugar`,
  "chlorides" = `chlorides`,
  "free sulfur dioxide" = `free sulfur dioxide`,
  "total sulfur dioxide" = `total sulfur dioxide`,
  "density" = `density`,
  "pH" = `pH`,
  "sulphates" = `sulphates`,
  "alcohol" = `alcohol`,
  check.names = FALSE
)
nuevo.dato
## fixed acidity volatile acidity citric acid residual sugar chlorides
## 1 7 0.68 0.16 5 0.055
## free sulfur dioxide total sulfur dioxide density pH sulphates alcohol
## 1 15 60 0.95 4 0.6 8.5
# Score the hypothetical wine in `nuevo.dato`.
# The result is stored in its own object: assigning it to `prediccion.quality`
# replaced the validation-set predictions with a length-1 vector, so the
# `prediccion.quality[1]` shown next to `datos.Valida[1,]` below reported the
# new wine's prediction instead of the first validation row's.
prediccion.nuevo <- predict(arbol, newdata = nuevo.dato)
prediccion.nuevo
## 1
## 5.282828
# First validation row and its own prediction (validation predictions intact).
datos.Valida[1,]
## # A tibble: 1 x 12
## `fixed acidity` `volatile acidi… `citric acid` `residual sugar` chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7.9 0.6 0.06 1.6 0.069
## # … with 7 more variables: `free sulfur dioxide` <dbl>, `total sulfur
## # dioxide` <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>, alcohol <dbl>,
## # quality <dbl>
prediccion.quality[1]
## 1
## 5.012903