Además, los resultados muestran que la calidad del vino está
influenciada principalmente por el contenido de alcohol y la acidez
volátil. El análisis de correlación mostró que el alcohol presenta la
asociación positiva más alta con la calidad (r=0.476), mientras que la
acidez volátil registra la relación negativa más fuerte (r=−0.391).
Estos resultados sugieren que los vinos con mayor graduación alcohólica
y menores niveles de acidez volátil tienden a obtener mejores
calificaciones de calidad. Por eso, estas variables constituyen factores
críticos para futuras estrategias de mejora y modelamiento
predictivo.
library(ggplot2)
library(pastecs)
library(car)
## Cargando paquete requerido: carData
library(gridExtra)
library(corrplot)
## corrplot 0.95 loaded
# Se carga el archivo csv para realizar el análisis
library(readr)
dat <- read_csv("winequality-red.csv")
## Rows: 1599 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (12): fixed acidity, volatile acidity, citric acid, residual sugar, chlo...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(dat)
head(dat, 6)
## # A tibble: 6 × 12
## `fixed acidity` `volatile acidity` `citric acid` `residual sugar` chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7.4 0.7 0 1.9 0.076
## 2 7.8 0.88 0 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.7 0 1.9 0.076
## 6 7.4 0.66 0 1.8 0.075
## # ℹ 7 more variables: `free sulfur dioxide` <dbl>,
## # `total sulfur dioxide` <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>,
## # alcohol <dbl>, quality <dbl>
tail(dat, 6)
## # A tibble: 6 × 12
## `fixed acidity` `volatile acidity` `citric acid` `residual sugar` chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 6.8 0.62 0.08 1.9 0.068
## 2 6.2 0.6 0.08 2 0.09
## 3 5.9 0.55 0.1 2.2 0.062
## 4 6.3 0.51 0.13 2.3 0.076
## 5 5.9 0.645 0.12 2 0.075
## 6 6 0.31 0.47 3.6 0.067
## # ℹ 7 more variables: `free sulfur dioxide` <dbl>,
## # `total sulfur dioxide` <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>,
## # alcohol <dbl>, quality <dbl>
str(dat)
## spc_tbl_ [1,599 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ fixed acidity : num [1:1599] 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile acidity : num [1:1599] 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric acid : num [1:1599] 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual sugar : num [1:1599] 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num [1:1599] 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free sulfur dioxide : num [1:1599] 11 25 15 17 11 13 15 15 9 17 ...
## $ total sulfur dioxide: num [1:1599] 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num [1:1599] 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num [1:1599] 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num [1:1599] 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num [1:1599] 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : num [1:1599] 5 5 5 6 5 5 5 7 7 5 ...
## - attr(*, "spec")=
## .. cols(
## .. `fixed acidity` = col_double(),
## .. `volatile acidity` = col_double(),
## .. `citric acid` = col_double(),
## .. `residual sugar` = col_double(),
## .. chlorides = col_double(),
## .. `free sulfur dioxide` = col_double(),
## .. `total sulfur dioxide` = col_double(),
## .. density = col_double(),
## .. pH = col_double(),
## .. sulphates = col_double(),
## .. alcohol = col_double(),
## .. quality = col_double()
## .. )
## - attr(*, "problems")=<pointer: 0x00000243eb6ff8a0>
names(dat)
## [1] "fixed acidity" "volatile acidity" "citric acid"
## [4] "residual sugar" "chlorides" "free sulfur dioxide"
## [7] "total sulfur dioxide" "density" "pH"
## [10] "sulphates" "alcohol" "quality"
nrow(dat)
## [1] 1599
ncol(dat)
## [1] 12
dim(dat)
## [1] 1599 12
cat("Valores faltantes:", sum(is.na(dat)), "\n")
## Valores faltantes: 0
# Se crea una función para analizar cada variable
analizar_variable <- function(variable, nombre_var, unidad = "") {
cat(rep("=", 60), "\n")
cat(" VARIABLE:", nombre_var, "\n")
cat(rep("=", 60), "\n\n")
# Moda
tab <- table(variable)
moda <- as.numeric(names(tab)[which.max(tab)])
# Tendencia central
cat("MEDIDAS DE TENDENCIA CENTRAL \n")
cat(" Media: ", round(mean(variable, na.rm = TRUE), 4), unidad, "\n")
cat(" Mediana: ", round(median(variable, na.rm = TRUE), 4), unidad, "\n")
cat(" Moda: ", moda, unidad, "\n\n")
# Dispersión
cat("MEDIDAS DE DISPERSIÓN \n")
cat(" Mínimo: ", round(min(variable, na.rm = TRUE), 4), unidad, "\n")
cat(" Máximo: ", round(max(variable, na.rm = TRUE), 4), unidad, "\n")
cat(" Rango: ", round(max(variable, na.rm = TRUE) - min(variable, na.rm = TRUE), 4), unidad, "\n")
cat(" Q1 (25%): ", round(quantile(variable, 0.25, na.rm = TRUE), 4), unidad, "\n")
cat(" Q3 (75%): ", round(quantile(variable, 0.75, na.rm = TRUE), 4), unidad, "\n")
cat(" IQR: ", round(IQR(variable, na.rm = TRUE), 4), unidad, "\n")
cat(" Varianza: ", round(var(variable, na.rm = TRUE), 4), "\n")
cat(" Desv. Est.: ", round(sd(variable, na.rm = TRUE), 4), unidad, "\n")
cv <- sd(variable, na.rm = TRUE) / mean(variable, na.rm = TRUE) * 100
cat(" Coef. Var.: ", round(cv, 2), "%\n\n")
# Forma distribución
desc <- stat.desc(variable, norm = TRUE)
cat("FORMA DE LA DISTRIBUCIÓN \n")
cat(" Asimetría (skewness):", round(desc["skewness"], 3), "\n")
cat(" Curtosis (kurtosis):", round(desc["kurtosis"], 3), "\n\n")
# Percentiles
cat("PERCENTILES \n")
percs <- quantile(variable, probs = c(0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95), na.rm = TRUE)
print(round(percs, 4))
cat("\n")
}
# Se crea una función para visualizar los gráficos para cada variable
graficar_variable <- function(variable, nombre_var, color_hex) {
df_tmp <- data.frame(x = variable)
p1 <- ggplot(df_tmp, aes(x = x)) +
geom_histogram(bins = 30, fill = color_hex, color = "white", alpha = 0.85) +
labs(title = paste("Histograma:", nombre_var),
x = nombre_var, y = "Frecuencia") +
theme_minimal()
p2 <- ggplot(df_tmp, aes(y = x)) +
geom_boxplot(fill = color_hex, alpha = 0.7, color = "gray20") +
labs(title = paste("Boxplot:", nombre_var), y = nombre_var) +
theme_minimal()
p3 <- ggplot(df_tmp, aes(x = x)) +
geom_density(fill = color_hex, alpha = 0.5) +
labs(title = paste("Densidad:", nombre_var),
x = nombre_var, y = "Densidad") +
theme_minimal()
p4 <- ggplot(df_tmp, aes(sample = x)) +
stat_qq(color = color_hex) +
stat_qq_line(color = "black") +
labs(title = paste("QQ-Plot:", nombre_var)) +
theme_minimal()
print(grid.arrange(p1, p2, p3, p4, ncol = 2))
}
# Análisis de cada variable
# ── fixed acidity ─────────────────────────────────────────────────────────────
analizar_variable(dat$`fixed acidity`, "fixed acidity", "g/L")
## = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
## VARIABLE: fixed acidity
## = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
##
## MEDIDAS DE TENDENCIA CENTRAL
## Media: 8.3196 g/L
## Mediana: 7.9 g/L
## Moda: 7.2 g/L
##
## MEDIDAS DE DISPERSIÓN
## Mínimo: 4.6 g/L
## Máximo: 15.9 g/L
## Rango: 11.3 g/L
## Q1 (25%): 7.1 g/L
## Q3 (75%): 9.2 g/L
## IQR: 2.1 g/L
## Varianza: 3.0314
## Desv. Est.: 1.7411 g/L
## Coef. Var.: 20.93 %
##
## FORMA DE LA DISTRIBUCIÓN
## Asimetría (skewness): 0.981
## Curtosis (kurtosis): 1.12
##
## PERCENTILES
## 5% 10% 25% 50% 75% 90% 95%
## 6.1 6.5 7.1 7.9 9.2 10.7 11.8
graficar_variable(dat$`fixed acidity`, "fixed acidity", "#C0392B")

## TableGrob (2 x 2) "arrange": 4 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
# ── volatile acidity ──────────────────────────────────────────────────────────
analizar_variable(dat$`volatile acidity`, "volatile acidity", "g/L")
## = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
## VARIABLE: volatile acidity
## = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
##
## MEDIDAS DE TENDENCIA CENTRAL
## Media: 0.5278 g/L
## Mediana: 0.52 g/L
## Moda: 0.6 g/L
##
## MEDIDAS DE DISPERSIÓN
## Mínimo: 0.12 g/L
## Máximo: 1.58 g/L
## Rango: 1.46 g/L
## Q1 (25%): 0.39 g/L
## Q3 (75%): 0.64 g/L
## IQR: 0.25 g/L
## Varianza: 0.0321
## Desv. Est.: 0.1791 g/L
## Coef. Var.: 33.92 %
##
## FORMA DE LA DISTRIBUCIÓN
## Asimetría (skewness): 0.67
## Curtosis (kurtosis): 1.213
##
## PERCENTILES
## 5% 10% 25% 50% 75% 90% 95%
## 0.270 0.310 0.390 0.520 0.640 0.745 0.840
graficar_variable(dat$`volatile acidity`, "volatile acidity", "#E67E22")

## TableGrob (2 x 2) "arrange": 4 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
# ── citric acid ───────────────────────────────────────────────────────────────
analizar_variable(dat$`citric acid`, "citric acid", "g/L")
## = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
## VARIABLE: citric acid
## = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
##
## MEDIDAS DE TENDENCIA CENTRAL
## Media: 0.271 g/L
## Mediana: 0.26 g/L
## Moda: 0 g/L
##
## MEDIDAS DE DISPERSIÓN
## Mínimo: 0 g/L
## Máximo: 1 g/L
## Rango: 1 g/L
## Q1 (25%): 0.09 g/L
## Q3 (75%): 0.42 g/L
## IQR: 0.33 g/L
## Varianza: 0.0379
## Desv. Est.: 0.1948 g/L
## Coef. Var.: 71.89 %
##
## FORMA DE LA DISTRIBUCIÓN
## Asimetría (skewness): 0.318
## Curtosis (kurtosis): -0.793
##
## PERCENTILES
## 5% 10% 25% 50% 75% 90% 95%
## 0.000 0.010 0.090 0.260 0.420 0.522 0.600
graficar_variable(dat$`citric acid`, "citric acid", "#F1C40F")

## TableGrob (2 x 2) "arrange": 4 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
# ── residual sugar ────────────────────────────────────────────────────────────
analizar_variable(dat$`residual sugar`, "residual sugar", "g/L")
## = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
## VARIABLE: residual sugar
## = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
##
## MEDIDAS DE TENDENCIA CENTRAL
## Media: 2.5388 g/L
## Mediana: 2.2 g/L
## Moda: 2 g/L
##
## MEDIDAS DE DISPERSIÓN
## Mínimo: 0.9 g/L
## Máximo: 15.5 g/L
## Rango: 14.6 g/L
## Q1 (25%): 1.9 g/L
## Q3 (75%): 2.6 g/L
## IQR: 0.7 g/L
## Varianza: 1.9879
## Desv. Est.: 1.4099 g/L
## Coef. Var.: 55.54 %
##
## FORMA DE LA DISTRIBUCIÓN
## Asimetría (skewness): 4.532
## Curtosis (kurtosis): 28.485
##
## PERCENTILES
## 5% 10% 25% 50% 75% 90% 95%
## 1.59 1.70 1.90 2.20 2.60 3.60 5.10
graficar_variable(dat$`residual sugar`, "residual sugar", "#27AE60")

## TableGrob (2 x 2) "arrange": 4 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
# ── chlorides ─────────────────────────────────────────────────────────────────
analizar_variable(dat$chlorides, "chlorides", "g/L")
## = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
## VARIABLE: chlorides
## = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
##
## MEDIDAS DE TENDENCIA CENTRAL
## Media: 0.0875 g/L
## Mediana: 0.079 g/L
## Moda: 0.08 g/L
##
## MEDIDAS DE DISPERSIÓN
## Mínimo: 0.012 g/L
## Máximo: 0.611 g/L
## Rango: 0.599 g/L
## Q1 (25%): 0.07 g/L
## Q3 (75%): 0.09 g/L
## IQR: 0.02 g/L
## Varianza: 0.0022
## Desv. Est.: 0.0471 g/L
## Coef. Var.: 53.81 %
##
## FORMA DE LA DISTRIBUCIÓN
## Asimetría (skewness): 5.67
## Curtosis (kurtosis): 41.526
##
## PERCENTILES
## 5% 10% 25% 50% 75% 90% 95%
## 0.0540 0.0600 0.0700 0.0790 0.0900 0.1090 0.1261
graficar_variable(dat$chlorides, "chlorides", "#2980B9")

## TableGrob (2 x 2) "arrange": 4 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
# ── free sulfur dioxide ───────────────────────────────────────────────────────
analizar_variable(dat$`free sulfur dioxide`, "free sulfur dioxide", "mg/L")
## = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
## VARIABLE: free sulfur dioxide
## = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
##
## MEDIDAS DE TENDENCIA CENTRAL
## Media: 15.8749 mg/L
## Mediana: 14 mg/L
## Moda: 6 mg/L
##
## MEDIDAS DE DISPERSIÓN
## Mínimo: 1 mg/L
## Máximo: 72 mg/L
## Rango: 71 mg/L
## Q1 (25%): 7 mg/L
## Q3 (75%): 21 mg/L
## IQR: 14 mg/L
## Varianza: 109.4149
## Desv. Est.: 10.4602 mg/L
## Coef. Var.: 65.89 %
##
## FORMA DE LA DISTRIBUCIÓN
## Asimetría (skewness): 1.248
## Curtosis (kurtosis): 2.007
##
## PERCENTILES
## 5% 10% 25% 50% 75% 90% 95%
## 4 5 7 14 21 31 35
graficar_variable(dat$`free sulfur dioxide`, "free sulfur dioxide", "#8E44AD")

## TableGrob (2 x 2) "arrange": 4 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
# ── total sulfur dioxide ──────────────────────────────────────────────────────
analizar_variable(dat$`total sulfur dioxide`, "total sulfur dioxide", "mg/L")
## = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
## VARIABLE: total sulfur dioxide
## = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
##
## MEDIDAS DE TENDENCIA CENTRAL
## Media: 46.4678 mg/L
## Mediana: 38 mg/L
## Moda: 28 mg/L
##
## MEDIDAS DE DISPERSIÓN
## Mínimo: 6 mg/L
## Máximo: 289 mg/L
## Rango: 283 mg/L
## Q1 (25%): 22 mg/L
## Q3 (75%): 62 mg/L
## IQR: 40 mg/L
## Varianza: 1082.102
## Desv. Est.: 32.8953 mg/L
## Coef. Var.: 70.79 %
##
## FORMA DE LA DISTRIBUCIÓN
## Asimetría (skewness): 1.513
## Curtosis (kurtosis): 3.786
##
## PERCENTILES
## 5% 10% 25% 50% 75% 90% 95%
## 11.0 14.0 22.0 38.0 62.0 93.2 112.1
graficar_variable(dat$`total sulfur dioxide`, "total sulfur dioxide", "#D35400")

## TableGrob (2 x 2) "arrange": 4 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
# ── density ───────────────────────────────────────────────────────────────────
analizar_variable(dat$density, "density", "g/cm3")
## = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
## VARIABLE: density
## = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
##
## MEDIDAS DE TENDENCIA CENTRAL
## Media: 0.9967 g/cm3
## Mediana: 0.9968 g/cm3
## Moda: 0.9972 g/cm3
##
## MEDIDAS DE DISPERSIÓN
## Mínimo: 0.9901 g/cm3
## Máximo: 1.0037 g/cm3
## Rango: 0.0136 g/cm3
## Q1 (25%): 0.9956 g/cm3
## Q3 (75%): 0.9978 g/cm3
## IQR: 0.0022 g/cm3
## Varianza: 0
## Desv. Est.: 0.0019 g/cm3
## Coef. Var.: 0.19 %
##
## FORMA DE LA DISTRIBUCIÓN
## Asimetría (skewness): 0.071
## Curtosis (kurtosis): 0.923
##
## PERCENTILES
## 5% 10% 25% 50% 75% 90% 95%
## 0.9936 0.9946 0.9956 0.9968 0.9978 0.9991 1.0000
graficar_variable(dat$density, "density", "#1ABC9C")

## TableGrob (2 x 2) "arrange": 4 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
# ── pH ────────────────────────────────────────────────────────────────────────
analizar_variable(dat$pH, "pH", "")
## = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
## VARIABLE: pH
## = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
##
## MEDIDAS DE TENDENCIA CENTRAL
## Media: 3.3111
## Mediana: 3.31
## Moda: 3.3
##
## MEDIDAS DE DISPERSIÓN
## Mínimo: 2.74
## Máximo: 4.01
## Rango: 1.27
## Q1 (25%): 3.21
## Q3 (75%): 3.4
## IQR: 0.19
## Varianza: 0.0238
## Desv. Est.: 0.1544
## Coef. Var.: 4.66 %
##
## FORMA DE LA DISTRIBUCIÓN
## Asimetría (skewness): 0.193
## Curtosis (kurtosis): 0.796
##
## PERCENTILES
## 5% 10% 25% 50% 75% 90% 95%
## 3.06 3.12 3.21 3.31 3.40 3.51 3.57
graficar_variable(dat$pH, "pH", "#2ECC71")

## TableGrob (2 x 2) "arrange": 4 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
# ── sulphates ─────────────────────────────────────────────────────────────────
analizar_variable(dat$sulphates, "sulphates", "g/L")
## = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
## VARIABLE: sulphates
## = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
##
## MEDIDAS DE TENDENCIA CENTRAL
## Media: 0.6581 g/L
## Mediana: 0.62 g/L
## Moda: 0.6 g/L
##
## MEDIDAS DE DISPERSIÓN
## Mínimo: 0.33 g/L
## Máximo: 2 g/L
## Rango: 1.67 g/L
## Q1 (25%): 0.55 g/L
## Q3 (75%): 0.73 g/L
## IQR: 0.18 g/L
## Varianza: 0.0287
## Desv. Est.: 0.1695 g/L
## Coef. Var.: 25.76 %
##
## FORMA DE LA DISTRIBUCIÓN
## Asimetría (skewness): 2.424
## Curtosis (kurtosis): 11.662
##
## PERCENTILES
## 5% 10% 25% 50% 75% 90% 95%
## 0.47 0.50 0.55 0.62 0.73 0.85 0.93
graficar_variable(dat$sulphates, "sulphates", "#E74C3C")

## TableGrob (2 x 2) "arrange": 4 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
# ── alcohol ───────────────────────────────────────────────────────────────────
analizar_variable(dat$alcohol, "alcohol", "% vol")
## = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
## VARIABLE: alcohol
## = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
##
## MEDIDAS DE TENDENCIA CENTRAL
## Media: 10.423 % vol
## Mediana: 10.2 % vol
## Moda: 9.5 % vol
##
## MEDIDAS DE DISPERSIÓN
## Mínimo: 8.4 % vol
## Máximo: 14.9 % vol
## Rango: 6.5 % vol
## Q1 (25%): 9.5 % vol
## Q3 (75%): 11.1 % vol
## IQR: 1.6 % vol
## Varianza: 1.1356
## Desv. Est.: 1.0657 % vol
## Coef. Var.: 10.22 %
##
## FORMA DE LA DISTRIBUCIÓN
## Asimetría (skewness): 0.859
## Curtosis (kurtosis): 0.192
##
## PERCENTILES
## 5% 10% 25% 50% 75% 90% 95%
## 9.2 9.3 9.5 10.2 11.1 12.0 12.5
graficar_variable(dat$alcohol, "alcohol", "#9B59B6")

## TableGrob (2 x 2) "arrange": 4 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
# ── quality ───────────────────────────────────────────────────────────────────
cat(rep("=", 60), "\n")
## = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
cat(" VARIABLE: quality\n")
## VARIABLE: quality
cat(rep("=", 60), "\n\n")
## = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
cat("Tabla de frecuencias:\n")
## Tabla de frecuencias:
print(table(dat$quality))
##
## 3 4 5 6 7 8
## 10 53 681 638 199 18
cat("\nProporciones (%):\n")
##
## Proporciones (%):
print(round(prop.table(table(dat$quality)) * 100, 2))
##
## 3 4 5 6 7 8
## 0.63 3.31 42.59 39.90 12.45 1.13
dat$quality_factor <- factor(dat$quality)
p1 <- ggplot(dat, aes(x = quality_factor, fill = quality_factor)) +
geom_bar(color = "white", alpha = 0.9) +
scale_fill_brewer(palette = "RdYlGn") +
labs(title = "Distribución de Calidad", x = "Calidad", y = "Frecuencia") +
theme_minimal()
p2 <- ggplot(dat, aes(x = quality_factor,
y = after_stat(count) / sum(after_stat(count)),
fill = quality_factor)) +
geom_bar(color = "white", alpha = 0.9) +
scale_fill_brewer(palette = "RdYlGn") +
scale_y_continuous(labels = scales::percent) +
labs(title = "Proporción de Calidad", x = "Calidad", y = "Proporción") +
theme_minimal()
print(grid.arrange(p1, p2, ncol = 2))

## TableGrob (1 x 2) "arrange": 2 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
# Resumen estadístico
summary(dat[, 1:12])
## fixed acidity volatile acidity citric acid residual sugar
## Min. : 4.60 Min. :0.1200 Min. :0.000 Min. : 0.900
## 1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900
## Median : 7.90 Median :0.5200 Median :0.260 Median : 2.200
## Mean : 8.32 Mean :0.5278 Mean :0.271 Mean : 2.539
## 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600
## Max. :15.90 Max. :1.5800 Max. :1.000 Max. :15.500
## chlorides free sulfur dioxide total sulfur dioxide density
## Min. :0.01200 Min. : 1.00 Min. : 6.00 Min. :0.9901
## 1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 22.00 1st Qu.:0.9956
## Median :0.07900 Median :14.00 Median : 38.00 Median :0.9968
## Mean :0.08747 Mean :15.87 Mean : 46.47 Mean :0.9967
## 3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 62.00 3rd Qu.:0.9978
## Max. :0.61100 Max. :72.00 Max. :289.00 Max. :1.0037
## pH sulphates alcohol quality
## Min. :2.740 Min. :0.3300 Min. : 8.40 Min. :3.000
## 1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.310 Median :0.6200 Median :10.20 Median :6.000
## Mean :3.311 Mean :0.6581 Mean :10.42 Mean :5.636
## 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :14.90 Max. :8.000
round(stat.desc(dat[, 1:11], norm = TRUE), 3)
## fixed acidity volatile acidity citric acid residual sugar
## nbr.val 1599.000 1599.000 1599.000 1599.000
## nbr.null 0.000 0.000 132.000 0.000
## nbr.na 0.000 0.000 0.000 0.000
## min 4.600 0.120 0.000 0.900
## max 15.900 1.580 1.000 15.500
## range 11.300 1.460 1.000 14.600
## sum 13303.100 843.985 433.290 4059.550
## median 7.900 0.520 0.260 2.200
## mean 8.320 0.528 0.271 2.539
## SE.mean 0.044 0.004 0.005 0.035
## CI.mean.0.95 0.085 0.009 0.010 0.069
## var 3.031 0.032 0.038 1.988
## std.dev 1.741 0.179 0.195 1.410
## coef.var 0.209 0.339 0.719 0.555
## skewness 0.981 0.670 0.318 4.532
## skew.2SE 8.014 5.477 2.596 37.028
## kurtosis 1.120 1.213 -0.793 28.485
## kurt.2SE 4.577 4.957 -3.242 116.435
## normtest.W 0.942 0.974 0.955 0.566
## normtest.p 0.000 0.000 0.000 0.000
## chlorides free sulfur dioxide total sulfur dioxide density
## nbr.val 1599.000 1599.000 1599.000 1599.000
## nbr.null 0.000 0.000 0.000 0.000
## nbr.na 0.000 0.000 0.000 0.000
## min 0.012 1.000 6.000 0.990
## max 0.611 72.000 289.000 1.004
## range 0.599 71.000 283.000 0.014
## sum 139.859 25384.000 74302.000 1593.798
## median 0.079 14.000 38.000 0.997
## mean 0.087 15.875 46.468 0.997
## SE.mean 0.001 0.262 0.823 0.000
## CI.mean.0.95 0.002 0.513 1.614 0.000
## var 0.002 109.415 1082.102 0.000
## std.dev 0.047 10.460 32.895 0.002
## coef.var 0.538 0.659 0.708 0.002
## skewness 5.670 1.248 1.513 0.071
## skew.2SE 46.322 10.198 12.359 0.581
## kurtosis 41.526 2.007 3.786 0.923
## kurt.2SE 169.740 8.205 15.474 3.771
## normtest.W 0.484 0.902 0.873 0.991
## normtest.p 0.000 0.000 0.000 0.000
## pH sulphates alcohol
## nbr.val 1599.000 1599.000 1599.000
## nbr.null 0.000 0.000 0.000
## nbr.na 0.000 0.000 0.000
## min 2.740 0.330 8.400
## max 4.010 2.000 14.900
## range 1.270 1.670 6.500
## sum 5294.470 1052.380 16666.350
## median 3.310 0.620 10.200
## mean 3.311 0.658 10.423
## SE.mean 0.004 0.004 0.027
## CI.mean.0.95 0.008 0.008 0.052
## var 0.024 0.029 1.136
## std.dev 0.154 0.170 1.066
## coef.var 0.047 0.258 0.102
## skewness 0.193 2.424 0.859
## skew.2SE 1.579 19.805 7.020
## kurtosis 0.796 11.662 0.192
## kurt.2SE 3.253 47.667 0.783
## normtest.W 0.993 0.833 0.929
## normtest.p 0.000 0.000 0.000
# Análisis multivariado
# ── Coeficientes de variación comparados ──────────────────────────────────────
cv_tabla <- data.frame(
Variable = names(dat)[1:11],
CV_pct = sapply(dat[, 1:11], function(x) round(sd(x) / mean(x) * 100, 2))
)
cv_tabla <- cv_tabla[order(cv_tabla$CV_pct, decreasing = TRUE), ]
print(cv_tabla)
## Variable CV_pct
## citric acid citric acid 71.89
## total sulfur dioxide total sulfur dioxide 70.79
## free sulfur dioxide free sulfur dioxide 65.89
## residual sugar residual sugar 55.54
## chlorides chlorides 53.81
## volatile acidity volatile acidity 33.92
## sulphates sulphates 25.76
## fixed acidity fixed acidity 20.93
## alcohol alcohol 10.22
## pH pH 4.66
## density density 0.19
print(
ggplot(cv_tabla, aes(x = reorder(Variable, CV_pct), y = CV_pct, fill = CV_pct)) +
geom_col(show.legend = FALSE, alpha = 0.85) +
geom_text(aes(label = paste0(CV_pct, "%")), hjust = -0.1, size = 3.5) +
coord_flip() +
scale_fill_gradient(low = "#3498DB", high = "#C0392B") +
labs(title = "Coeficiente de Variación por Variable",
x = "Variable", y = "CV (%)") +
theme_minimal()
)

# ── Matriz de correlación ─────────────────────────────────────────────────────
matriz_cor <- cor(dat[, 1:12])
print(round(matriz_cor, 3))
## fixed acidity volatile acidity citric acid residual sugar
## fixed acidity 1.000 -0.256 0.672 0.115
## volatile acidity -0.256 1.000 -0.552 0.002
## citric acid 0.672 -0.552 1.000 0.144
## residual sugar 0.115 0.002 0.144 1.000
## chlorides 0.094 0.061 0.204 0.056
## free sulfur dioxide -0.154 -0.011 -0.061 0.187
## total sulfur dioxide -0.113 0.076 0.036 0.203
## density 0.668 0.022 0.365 0.355
## pH -0.683 0.235 -0.542 -0.086
## sulphates 0.183 -0.261 0.313 0.006
## alcohol -0.062 -0.202 0.110 0.042
## quality 0.124 -0.391 0.226 0.014
## chlorides free sulfur dioxide total sulfur dioxide density
## fixed acidity 0.094 -0.154 -0.113 0.668
## volatile acidity 0.061 -0.011 0.076 0.022
## citric acid 0.204 -0.061 0.036 0.365
## residual sugar 0.056 0.187 0.203 0.355
## chlorides 1.000 0.006 0.047 0.201
## free sulfur dioxide 0.006 1.000 0.668 -0.022
## total sulfur dioxide 0.047 0.668 1.000 0.071
## density 0.201 -0.022 0.071 1.000
## pH -0.265 0.070 -0.066 -0.342
## sulphates 0.371 0.052 0.043 0.149
## alcohol -0.221 -0.069 -0.206 -0.496
## quality -0.129 -0.051 -0.185 -0.175
## pH sulphates alcohol quality
## fixed acidity -0.683 0.183 -0.062 0.124
## volatile acidity 0.235 -0.261 -0.202 -0.391
## citric acid -0.542 0.313 0.110 0.226
## residual sugar -0.086 0.006 0.042 0.014
## chlorides -0.265 0.371 -0.221 -0.129
## free sulfur dioxide 0.070 0.052 -0.069 -0.051
## total sulfur dioxide -0.066 0.043 -0.206 -0.185
## density -0.342 0.149 -0.496 -0.175
## pH 1.000 -0.197 0.206 -0.058
## sulphates -0.197 1.000 0.094 0.251
## alcohol 0.206 0.094 1.000 0.476
## quality -0.058 0.251 0.476 1.000
corrplot(matriz_cor,
method = "color",
type = "upper",
tl.col = "black",
tl.srt = 45,
addCoef.col = "black",
number.cex = 0.60,
col = colorRampPalette(c("#3498DB", "white", "#C0392B"))(200),
title = "Matriz de Correlación - Vino Tinto",
mar = c(0, 0, 1, 0))

# ── Calidad vs variables clave ────────────────────────────────────────────────
vars_clave <- c("alcohol", "volatile acidity", "sulphates", "citric acid")
plots_cal <- lapply(vars_clave, function(v) {
df_p <- data.frame(cal = dat$quality_factor, val = dat[[v]])
ggplot(df_p, aes(x = cal, y = val, fill = cal)) +
geom_boxplot(alpha = 0.8, show.legend = FALSE) +
scale_fill_brewer(palette = "RdYlGn") +
labs(title = v, x = "Calidad", y = v) +
theme_minimal(base_size = 10)
})
print(grid.arrange(grobs = plots_cal, ncol = 2,
top = "Variables clave vs Calidad"))

## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.2207]