Desarrollar regresión para evaluar la calidad del vino
Analizar y aplicar la técnica de regresión lineal en el conjunto de datos de vinos; interpretar los modelos de regresión lineal y de árbol de regresión para elaborar predicciones, hacer comparaciones y establecer resultados sobre la calidad de los vinos.
library(rpart) # Arboles
library(rpart.plot) # Visualizar y represenar árboles
library(caret) # Para llevar a cabo particiones de conjuntos de datos en caso de...
library(dplyr) # Para select, filter, mutate, arange ....
library(readr) # Para leer datos
library(ggplot2) # Para grafica mas vistosas
library(reshape) # Para renombrar columnas
library(corrplot) # Para correlaciones visuales
# Load the red-wine quality data from a path relative to the working
# directory; readr reports every column as double.
datos <- read_csv("datos/winequality-red.csv")
## Parsed with column specification:
## cols(
## fixed_acidity = col_double(),
## volatile_acidity = col_double(),
## citric_acid = col_double(),
## residual_sugar = col_double(),
## chlorides = col_double(),
## free_sulfur_dioxide = col_double(),
## total_sulfur_dioxide = col_double(),
## density = col_double(),
## pH = col_double(),
## sulphates = col_double(),
## alcohol = col_double(),
## quality = col_double()
## )
# Disabled: loading the white-wine file (read_csv2 expects ';' separators).
#datos_w <- read_csv2("datos/winequality-white.csv")
# Print the tibble to inspect its first rows and column types.
datos
## # A tibble: 1,599 x 12
## fixed_acidity volatile_acidity citric_acid residual_sugar chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7.4 0.7 0 1.9 0.076
## 2 7.8 0.88 0 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.7 0 1.9 0.076
## 6 7.4 0.66 0 1.8 0.075
## 7 7.9 0.6 0.06 1.6 0.069
## 8 7.3 0.65 0 1.2 0.065
## 9 7.8 0.580 0.02 2 0.073
## 10 7.5 0.5 0.36 6.1 0.071
## # … with 1,589 more rows, and 7 more variables: free_sulfur_dioxide <dbl>,
## # total_sulfur_dioxide <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>,
## # alcohol <dbl>, quality <dbl>
# Reorder the columns so the response variable (quality) comes first,
# followed by the eleven physicochemical predictors in their original order.
datos <- datos %>%
  select(
    quality,
    fixed_acidity, volatile_acidity, citric_acid,
    residual_sugar, chlorides,
    free_sulfur_dioxide, total_sulfur_dioxide,
    density, pH, sulphates, alcohol
  )
# Show the first six rows of the reordered data.
head(datos)
## # A tibble: 6 x 12
## quality fixed_acidity volatile_acidity citric_acid residual_sugar chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 5 7.4 0.7 0 1.9 0.076
## 2 5 7.8 0.88 0 2.6 0.098
## 3 5 7.8 0.76 0.04 2.3 0.092
## 4 6 11.2 0.28 0.56 1.9 0.075
## 5 5 7.4 0.7 0 1.9 0.076
## 6 5 7.4 0.66 0 1.8 0.075
## # … with 6 more variables: free_sulfur_dioxide <dbl>,
## # total_sulfur_dioxide <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>,
## # alcohol <dbl>
# Show the last six rows of the data.
tail(datos)
## # A tibble: 6 x 12
## quality fixed_acidity volatile_acidity citric_acid residual_sugar chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 6 6.8 0.62 0.08 1.9 0.068
## 2 5 6.2 0.6 0.08 2 0.09
## 3 6 5.9 0.55 0.1 2.2 0.062
## 4 6 6.3 0.51 0.13 2.3 0.076
## 5 5 5.9 0.645 0.12 2 0.075
## 6 6 6 0.31 0.47 3.6 0.067
## # … with 6 more variables: free_sulfur_dioxide <dbl>,
## # total_sulfur_dioxide <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>,
## # alcohol <dbl>
# Per-column descriptive statistics (min, quartiles, median, mean, max).
summary(datos)
## quality fixed_acidity volatile_acidity citric_acid
## Min. :3.000 Min. : 4.60 Min. :0.1200 Min. :0.000
## 1st Qu.:5.000 1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090
## Median :6.000 Median : 7.90 Median :0.5200 Median :0.260
## Mean :5.636 Mean : 8.32 Mean :0.5278 Mean :0.271
## 3rd Qu.:6.000 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420
## Max. :8.000 Max. :15.90 Max. :1.5800 Max. :1.000
## residual_sugar chlorides free_sulfur_dioxide total_sulfur_dioxide
## Min. : 0.900 Min. :0.01200 Min. : 1.00 Min. : 6.00
## 1st Qu.: 1.900 1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 22.00
## Median : 2.200 Median :0.07900 Median :14.00 Median : 38.00
## Mean : 2.539 Mean :0.08747 Mean :15.87 Mean : 46.47
## 3rd Qu.: 2.600 3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 62.00
## Max. :15.500 Max. :0.61100 Max. :72.00 Max. :289.00
## density pH sulphates alcohol
## Min. :0.9901 Min. :2.740 Min. :0.3300 Min. : 8.40
## 1st Qu.:0.9956 1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.50
## Median :0.9968 Median :3.310 Median :0.6200 Median :10.20
## Mean :0.9967 Mean :3.311 Mean :0.6581 Mean :10.42
## 3rd Qu.:0.9978 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10
## Max. :1.0037 Max. :4.010 Max. :2.0000 Max. :14.90
# Display the object's structure: classes, dimensions and column types.
str(datos)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 1599 obs. of 12 variables:
## $ quality : num 5 5 5 6 5 5 5 7 7 5 ...
## $ fixed_acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile_acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric_acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual_sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free_sulfur_dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
## $ total_sulfur_dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## - attr(*, "spec")=
## .. cols(
## .. fixed_acidity = col_double(),
## .. volatile_acidity = col_double(),
## .. citric_acid = col_double(),
## .. residual_sugar = col_double(),
## .. chlorides = col_double(),
## .. free_sulfur_dioxide = col_double(),
## .. total_sulfur_dioxide = col_double(),
## .. density = col_double(),
## .. pH = col_double(),
## .. sulphates = col_double(),
## .. alcohol = col_double(),
## .. quality = col_double()
## .. )
# Pairwise Pearson correlation matrix across all twelve numeric columns.
correlaciones <- cor(x = datos, method = "pearson")
print(correlaciones)
## quality fixed_acidity volatile_acidity citric_acid
## quality 1.00000000 0.12405165 -0.390557780 0.22637251
## fixed_acidity 0.12405165 1.00000000 -0.256130895 0.67170343
## volatile_acidity -0.39055778 -0.25613089 1.000000000 -0.55249568
## citric_acid 0.22637251 0.67170343 -0.552495685 1.00000000
## residual_sugar 0.01373164 0.11477672 0.001917882 0.14357716
## chlorides -0.12890656 0.09370519 0.061297772 0.20382291
## free_sulfur_dioxide -0.05065606 -0.15379419 -0.010503827 -0.06097813
## total_sulfur_dioxide -0.18510029 -0.11318144 0.076470005 0.03553302
## density -0.17491923 0.66804729 0.022026232 0.36494718
## pH -0.05773139 -0.68297819 0.234937294 -0.54190414
## sulphates 0.25139708 0.18300566 -0.260986685 0.31277004
## alcohol 0.47616632 -0.06166827 -0.202288027 0.10990325
## residual_sugar chlorides free_sulfur_dioxide
## quality 0.013731637 -0.128906560 -0.050656057
## fixed_acidity 0.114776724 0.093705186 -0.153794193
## volatile_acidity 0.001917882 0.061297772 -0.010503827
## citric_acid 0.143577162 0.203822914 -0.060978129
## residual_sugar 1.000000000 0.055609535 0.187048995
## chlorides 0.055609535 1.000000000 0.005562147
## free_sulfur_dioxide 0.187048995 0.005562147 1.000000000
## total_sulfur_dioxide 0.203027882 0.047400468 0.667666450
## density 0.355283371 0.200632327 -0.021945831
## pH -0.085652422 -0.265026131 0.070377499
## sulphates 0.005527121 0.371260481 0.051657572
## alcohol 0.042075437 -0.221140545 -0.069408354
## total_sulfur_dioxide density pH sulphates
## quality -0.18510029 -0.17491923 -0.05773139 0.251397079
## fixed_acidity -0.11318144 0.66804729 -0.68297819 0.183005664
## volatile_acidity 0.07647000 0.02202623 0.23493729 -0.260986685
## citric_acid 0.03553302 0.36494718 -0.54190414 0.312770044
## residual_sugar 0.20302788 0.35528337 -0.08565242 0.005527121
## chlorides 0.04740047 0.20063233 -0.26502613 0.371260481
## free_sulfur_dioxide 0.66766645 -0.02194583 0.07037750 0.051657572
## total_sulfur_dioxide 1.00000000 0.07126948 -0.06649456 0.042946836
## density 0.07126948 1.00000000 -0.34169933 0.148506412
## pH -0.06649456 -0.34169933 1.00000000 -0.196647602
## sulphates 0.04294684 0.14850641 -0.19664760 1.000000000
## alcohol -0.20565394 -0.49617977 0.20563251 0.093594750
## alcohol
## quality 0.47616632
## fixed_acidity -0.06166827
## volatile_acidity -0.20228803
## citric_acid 0.10990325
## residual_sugar 0.04207544
## chlorides -0.22114054
## free_sulfur_dioxide -0.06940835
## total_sulfur_dioxide -0.20565394
## density -0.49617977
## pH 0.20563251
## sulphates 0.09359475
## alcohol 1.00000000
Gráfica de correlaciones:
# Fix the RNG seed so the train/validation split is reproducible.
set.seed(2020)
# Draw the row indices for a 70 % training partition based on quality;
# list = FALSE returns them as a one-column matrix instead of a list.
entrena <- createDataPartition(y = datos$quality, p = 0.7, list = FALSE)
# Peek at the first selected indices.
head(entrena)
## Resample1
## [1,] 1
## [2,] 2
## [3,] 3
## [4,] 4
## [5,] 5
## [6,] 6
# Number of rows assigned to the training partition.
nrow(entrena)
## [1] 1120
# Preview the rows that were NOT selected by the training indices.
head(datos[-entrena,])
## # A tibble: 6 x 12
## quality fixed_acidity volatile_acidity citric_acid residual_sugar chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 5 7.9 0.6 0.06 1.6 0.069
## 2 7 7.3 0.65 0 1.2 0.065
## 3 5 7.5 0.5 0.36 6.1 0.071
## 4 7 8.5 0.28 0.56 1.8 0.092
## 5 5 8.1 0.56 0.28 1.7 0.368
## 6 5 7.9 0.43 0.21 1.6 0.106
## # … with 6 more variables: free_sulfur_dioxide <dbl>,
## # total_sulfur_dioxide <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>,
## # alcohol <dbl>
# Preview the first rows of the full data set.
head(datos)
## # A tibble: 6 x 12
## quality fixed_acidity volatile_acidity citric_acid residual_sugar chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 5 7.4 0.7 0 1.9 0.076
## 2 5 7.8 0.88 0 2.6 0.098
## 3 5 7.8 0.76 0.04 2.3 0.092
## 4 6 11.2 0.28 0.56 1.9 0.075
## 5 5 7.4 0.7 0 1.9 0.076
## 6 5 7.4 0.66 0 1.8 0.075
## # … with 6 more variables: free_sulfur_dioxide <dbl>,
## # total_sulfur_dioxide <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>,
## # alcohol <dbl>
# Training set: the rows picked out by the partition indices.
datos.Entrena <- datos[entrena, ]
# Preview the first training rows.
head(datos.Entrena)
## # A tibble: 6 x 12
## quality fixed_acidity volatile_acidity citric_acid residual_sugar chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 5 7.4 0.7 0 1.9 0.076
## 2 5 7.8 0.88 0 2.6 0.098
## 3 5 7.8 0.76 0.04 2.3 0.092
## 4 6 11.2 0.28 0.56 1.9 0.075
## 5 5 7.4 0.7 0 1.9 0.076
## 6 5 7.4 0.66 0 1.8 0.075
## # … with 6 more variables: free_sulfur_dioxide <dbl>,
## # total_sulfur_dioxide <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>,
## # alcohol <dbl>
# Descriptive statistics of the training set, column by column.
summary(datos.Entrena)
## quality fixed_acidity volatile_acidity citric_acid
## Min. :3.000 Min. : 4.70 Min. :0.1200 Min. :0.0000
## 1st Qu.:5.000 1st Qu.: 7.10 1st Qu.:0.4000 1st Qu.:0.0975
## Median :6.000 Median : 7.90 Median :0.5300 Median :0.2500
## Mean :5.635 Mean : 8.34 Mean :0.5326 Mean :0.2691
## 3rd Qu.:6.000 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.4300
## Max. :8.000 Max. :15.90 Max. :1.5800 Max. :0.7900
## residual_sugar chlorides free_sulfur_dioxide total_sulfur_dioxide
## Min. : 0.900 Min. :0.03400 Min. : 1.00 Min. : 6.00
## 1st Qu.: 1.900 1st Qu.:0.07100 1st Qu.: 8.00 1st Qu.: 22.00
## Median : 2.200 Median :0.08000 Median :14.00 Median : 38.00
## Mean : 2.554 Mean :0.08693 Mean :16.13 Mean : 46.82
## 3rd Qu.: 2.600 3rd Qu.:0.09025 3rd Qu.:22.00 3rd Qu.: 62.00
## Max. :15.500 Max. :0.46700 Max. :72.00 Max. :289.00
## density pH sulphates alcohol
## Min. :0.9901 Min. :2.860 Min. :0.3700 Min. : 8.4
## 1st Qu.:0.9956 1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.5
## Median :0.9968 Median :3.310 Median :0.6200 Median :10.1
## Mean :0.9968 Mean :3.311 Mean :0.6588 Mean :10.4
## 3rd Qu.:0.9979 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.1
## Max. :1.0037 Max. :4.010 Max. :1.9800 Max. :14.9
# Validation set: every row not chosen for training, followed by a preview.
datos.Valida <- datos[-entrena, ]
head(datos.Valida)
## # A tibble: 6 x 12
## quality fixed_acidity volatile_acidity citric_acid residual_sugar chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 5 7.9 0.6 0.06 1.6 0.069
## 2 7 7.3 0.65 0 1.2 0.065
## 3 5 7.5 0.5 0.36 6.1 0.071
## 4 7 8.5 0.28 0.56 1.8 0.092
## 5 5 8.1 0.56 0.28 1.7 0.368
## 6 5 7.9 0.43 0.21 1.6 0.106
## # … with 6 more variables: free_sulfur_dioxide <dbl>,
## # total_sulfur_dioxide <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>,
## # alcohol <dbl>
# Fit a multiple linear regression of quality on all other columns using
# only the training rows, and keep the fitted model in `modelo`.
modelo <- lm(formula = quality ~ ., data = datos.Entrena)
# Show the call and the estimated coefficients.
print(modelo)
##
## Call:
## lm(formula = quality ~ ., data = datos.Entrena)
##
## Coefficients:
## (Intercept) fixed_acidity volatile_acidity
## 20.747534 0.008274 -0.922244
## citric_acid residual_sugar chlorides
## -0.019610 0.011741 -1.804597
## free_sulfur_dioxide total_sulfur_dioxide density
## 0.006650 -0.003765 -16.312818
## pH sulphates alcohol
## -0.573518 0.928338 0.294048
# Residual quantiles, coefficient table with t-tests, and fit statistics
# (residual standard error, R-squared, F-statistic) for the linear model.
summary(modelo)
##
## Call:
## lm(formula = quality ~ ., data = datos.Entrena)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.70643 -0.36046 -0.04914 0.45944 1.98343
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.075e+01 2.562e+01 0.810 0.418266
## fixed_acidity 8.274e-03 3.111e-02 0.266 0.790320
## volatile_acidity -9.222e-01 1.446e-01 -6.378 2.63e-10 ***
## citric_acid -1.961e-02 1.793e-01 -0.109 0.912913
## residual_sugar 1.174e-02 1.716e-02 0.684 0.493948
## chlorides -1.805e+00 5.417e-01 -3.331 0.000893 ***
## free_sulfur_dioxide 6.650e-03 2.652e-03 2.508 0.012300 *
## total_sulfur_dioxide -3.765e-03 8.681e-04 -4.337 1.57e-05 ***
## density -1.631e+01 2.613e+01 -0.624 0.532544
## pH -5.735e-01 2.276e-01 -2.520 0.011878 *
## sulphates 9.283e-01 1.360e-01 6.824 1.46e-11 ***
## alcohol 2.940e-01 3.192e-02 9.212 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6611 on 1108 degrees of freedom
## Multiple R-squared: 0.3558, Adjusted R-squared: 0.3494
## F-statistic: 55.64 on 11 and 1108 DF, p-value: < 2.2e-16
# Predict quality for the validation rows with the fitted linear model.
predecir <- predict(object = modelo, newdata = datos.Valida)
# First validation observation, to compare with its prediction.
datos.Valida[1, ]
## # A tibble: 1 x 12
## quality fixed_acidity volatile_acidity citric_acid residual_sugar chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 5 7.9 0.6 0.06 1.6 0.069
## # … with 6 more variables: free_sulfur_dioxide <dbl>,
## # total_sulfur_dioxide <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>,
## # alcohol <dbl>
# Last observation of the validation set.
datos.Valida[nrow(datos.Valida),]
## # A tibble: 1 x 12
## quality fixed_acidity volatile_acidity citric_acid residual_sugar chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 6 6 0.31 0.47 3.6 0.067
## # … with 6 more variables: free_sulfur_dioxide <dbl>,
## # total_sulfur_dioxide <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>,
## # alcohol <dbl>
# Predicted quality for the first validation observation.
predecir[1]
## 1
## 5.074639
# Measurements of a single new wine to be scored by the model. There is no
# `quality` value here because that is the quantity being predicted.
# Values are assigned with `<-` (idiomatic R assignment) and written with a
# leading zero; names and values are the same as before.
fixed_acidity <- 8
volatile_acidity <- 0.6
citric_acid <- 0.1
residual_sugar <- 2
chlorides <- 0.09
free_sulfur_dioxide <- 12
total_sulfur_dioxide <- 50
density <- 0.996
pH <- 3.3
sulphates <- 0.55
alcohol <- 10

# Assemble a one-row data frame; the column names are taken from the variable
# names, so they match the predictor names used when fitting the model.
nuevo.Dato <- data.frame(
  fixed_acidity, volatile_acidity, citric_acid, residual_sugar, chlorides,
  free_sulfur_dioxide, total_sulfur_dioxide, density, pH, sulphates, alcohol
)
nuevo.Dato
## fixed_acidity volatile_acidity citric_acid residual_sugar chlorides
## 1 8 0.6 0.1 2 0.09
## free_sulfur_dioxide total_sulfur_dioxide density pH sulphates alcohol
## 1 12 50 0.996 3.3 0.55 10
# Predict the quality of the new wine with the fitted linear model.
# NOTE: this reuses `predecir`, so the validation-set predictions stored
# under that name are replaced by this single value.
predecir <- predict(object = modelo, newdata = nuevo.Dato)
print(predecir)
## 1
## 5.321922